Update docs for the new release (NVIDIA#50)
- CUDF 0.9.1
- XGBoost4J 1.0.0-Beta2
chuanlihao authored Sep 27, 2019
1 parent 8553889 commit b1034bf
Showing 27 changed files with 202 additions and 191 deletions.
3 changes: 0 additions & 3 deletions README.md
@@ -1,7 +1,5 @@
 This repo provides docs and example applications that demonstrate the RAPIDS.ai GPU-accelerated XGBoost-Spark project.
 
-*Please note: Spark 2.4.4 support is currently in development. Please run the examples with other Spark 2.4 versions, or Spark 2.3.*
-
 ### Examples
 
 - Mortgage: [Scala](/examples/apps/scala/src/main/scala/ai/rapids/spark/examples/mortgage), [Python](/examples/apps/python/ai/rapids/spark/examples/mortgage)
@@ -28,7 +26,6 @@ You can get a small size datasets for each example in the [datasets](/datasets)
   - [EMR](/getting-started-guides/csp/aws/emr.md)
   - [SageMaker](/getting-started-guides/csp/aws/sagemaker.md)
   - [Databricks](/getting-started-guides/csp/databricks/databricks.md)
-  - [Azure Databricks](/getting-started-guides/csp/azure-databricks/azure-databricks.md)
   - [Google Cloud Platform](/getting-started-guides/csp/gcp/gcp.md)
 - Getting started for Jupyter Notebook applications
   - [Apache Toree Notebook for Scala](/getting-started-guides/notebook/toree.md)
4 changes: 2 additions & 2 deletions examples/apps/scala/pom.xml
@@ -13,8 +13,8 @@

   <properties>
     <encoding>UTF-8</encoding>
-    <xgboost.version>1.0.0-Beta</xgboost.version>
-    <cudf.version>0.9</cudf.version>
+    <xgboost.version>1.0.0-Beta2</xgboost.version>
+    <cudf.version>0.9.1</cudf.version>
     <spark.version>2.4.0</spark.version>
     <scala.version>2.11.6</scala.version>
     <scala.binary.version>2.11</scala.binary.version>
CPUMain.scala (Agaricus example)
@@ -32,14 +32,14 @@ object CPUMain {
     def schema(length: Int): StructType =
       StructType(featureNames(length).map(n => StructField(n, FloatType)))
 
-    val xgboostArgs = XGBoostArgs.parse(args)
     val dataSchema = schema(126)
+    val xgboostArgs = XGBoostArgs.parse(args)
+    val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
+    val appInfo = Seq("Agaricus", processor, xgboostArgs.format)
 
     // build spark session
-    val objName = this.getClass.getSimpleName.stripSuffix("$")
-    val spark = SparkSetup(args, "AgaricusAppFor$objName")
-    spark.sparkContext.setLogLevel("WARN")
-
+    val spark = SparkSetup(args, appInfo.mkString("-"))
+    val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
     // === diff ===
     // build data reader
     val dataReader = spark.read
@@ -74,7 +74,7 @@ object CPUMain {
       .setFeaturesCol("features")
 
     println("\n------ Training ------")
-    val (model, _) = Benchmark.time(s"Agaricus CPU train ${xgboostArgs.format}") {
+    val (model, _) = benchmark.time("train") {
       xgbClassifier.fit(datasets(0).get)
     }
     // Save model if modelPath exists
@@ -88,7 +88,7 @@
     if (xgboostArgs.isToTransform) {
       // start transform
       println("\n------ Transforming ------")
-      var (results, _) = Benchmark.time(s"Agaricus CPU transform ${xgboostArgs.format}") {
+      var (results, _) = benchmark.time("transform") {
        val ret = xgbClassificationModel.transform(datasets(2).get).cache()
        ret.foreachPartition(_ => ())
        ret
@@ -102,9 +102,11 @@
 
       println("\n------Accuracy of Evaluation------")
       val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelName)
-      val accuracy = evaluator.evaluate(results)
-
-      println(s"accuracy == $accuracy")
+      evaluator.evaluate(results) match {
+        case accuracy if !accuracy.isNaN =>
+          benchmark.value(accuracy, "Accuracy", "Accuracy for")
+        // Throw an exception when NaN ?
+      }
     }
 
     spark.close()
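A side note on the new evaluation block (the same pattern recurs in the other examples below): as the inline "// Throw an exception when NaN ?" comment hints, the single-case match is non-exhaustive, so a NaN accuracy aborts with a `scala.MatchError` rather than a deliberate error. A hedged sketch of an exhaustive variant, reusing only names that appear in the diff (`evaluator`, `results`, `benchmark` come from the surrounding code); this is not part of the commit:

```scala
// Sketch, not part of this commit: make the NaN case explicit instead of
// relying on a bare single-case match (which throws scala.MatchError).
evaluator.evaluate(results) match {
  case accuracy if !accuracy.isNaN =>
    benchmark.value(accuracy, "Accuracy", "Accuracy for")
  case bad =>
    throw new IllegalStateException(s"Evaluation returned $bad; the model likely failed to train")
}
```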
GPUMain.scala (Agaricus example)
@@ -32,36 +32,26 @@ object GPUMain {
     def schema(length: Int): StructType =
       StructType(featureNames(length).map(n => StructField(n, FloatType)))
 
-    val xgboostArgs = XGBoostArgs.parse(args)
     val dataSchema = schema(126)
+    val xgboostArgs = XGBoostArgs.parse(args)
+    val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
+    val appInfo = Seq("Agaricus", processor, xgboostArgs.format)
 
     // build spark session
-    val objName = this.getClass.getSimpleName.stripSuffix("$")
-    val spark = SparkSetup(args, "AgaricusAppFor$objName")
-    spark.sparkContext.setLogLevel("WARN")
-
+    val spark = SparkSetup(args, appInfo.mkString("-"))
+    val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
     // === diff ===
     // build data reader
     val dataReader = new GpuDataReader(spark)
+      .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
 
     // load datasets, the order is (train, train-eval, eval)
     var datasets = xgboostArgs.dataPaths.map(_.map{
       path =>
         xgboostArgs.format match {
-          case "csv" => dataReader
-            .option("header", xgboostArgs.hasHeader)
-            .option("asFloats", xgboostArgs.asFloats)
-            .option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
-            .schema(dataSchema)
-            .csv(path)
-          case "parquet" => dataReader
-            .option("asFloats", xgboostArgs.asFloats)
-            .option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
-            .parquet(path)
-          case "orc" => dataReader
-            .option("asFloats", xgboostArgs.asFloats)
-            .option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
-            .orc(path)
+          case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(dataSchema).csv(path)
+          case "parquet" => dataReader.parquet(path)
+          case "orc" => dataReader.orc(path)
           case _ => throw new IllegalArgumentException("Unsupported data file format!")
         }
     })
@@ -85,7 +75,7 @@ object GPUMain {
       .setFeaturesCols(featureCols)
 
     println("\n------ Training ------")
-    val (model, _) = Benchmark.time(s"Agaricus GPU train ${xgboostArgs.format}") {
+    val (model, _) = benchmark.time("train") {
       xgbClassifier.fit(datasets(0).get)
     }
     // Save model if modelPath exists
@@ -99,7 +89,7 @@
     if (xgboostArgs.isToTransform) {
       // start transform
       println("\n------ Transforming ------")
-      var (results, _) = Benchmark.time(s"Agaricus GPU transform ${xgboostArgs.format}") {
+      var (results, _) = benchmark.time("transform") {
        val ret = xgbClassificationModel.transform(datasets(2).get).cache()
        ret.foreachPartition(_ => ())
        ret
@@ -113,9 +103,11 @@
 
       println("\n------Accuracy of Evaluation------")
       val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelName)
-      val accuracy = evaluator.evaluate(results)
-
-      println(s"accuracy == $accuracy")
+      evaluator.evaluate(results) match {
+        case accuracy if !accuracy.isNaN =>
+          benchmark.value(accuracy, "Accuracy", "Accuracy for")
+        // Throw an exception when NaN ?
+      }
     }
 
     spark.close()
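The GPU readers above also show the second theme of this commit: the shared `asFloats` and `maxRowsPerChunk` options are now set once when the `GpuDataReader` is constructed, so each format case adds only what is specific to it. A minimal sketch of the pattern as used in the diff (the paths are hypothetical; `spark`, `dataSchema`, and `xgboostArgs` come from the surrounding example):

```scala
// Common options hoisted to construction time; per-format cases stay minimal.
// "asFloats" and "maxRowsPerChunk" are the option names used in this diff.
val reader = new GpuDataReader(spark)
  .option("asFloats", xgboostArgs.asFloats)
  .option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)

val trainSet = xgboostArgs.format match {
  case "csv"     => reader.option("header", xgboostArgs.hasHeader).schema(dataSchema).csv("train.csv")
  case "parquet" => reader.parquet("train.parquet")
  case "orc"     => reader.orc("train.orc")
  case _         => throw new IllegalArgumentException("Unsupported data file format!")
}
```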
CPUMain.scala (Mortgage example)
@@ -26,13 +26,15 @@ object CPUMain extends Mortgage {
 
   def main(args: Array[String]): Unit = {
     val xgboostArgs = XGBoostArgs.parse(args)
+    val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
+    val appInfo = Seq(appName, processor, xgboostArgs.format)
 
     // build spark session
-    val objName = this.getClass.getSimpleName.stripSuffix("$")
     val spark = SparkSession.builder()
-      .appName(s"Mortgage-$objName-${xgboostArgs.format}")
+      .appName(appInfo.mkString("-"))
       .getOrCreate()
 
+    val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
     // === diff ===
     // build data reader
     val dataReader = spark.read
@@ -66,7 +68,8 @@
 
     // Start training
     println("\n------ Training ------")
-    val (model, _) = Benchmark.time(s"Mortgage CPU train ${xgboostArgs.format}") {
+    // Shall we not log the time if it is abnormal, which is usually caused by training failure
+    val (model, _) = benchmark.time("train") {
       xgbClassifier.fit(datasets(0).get)
     }
     // Save model if modelPath exists
@@ -79,7 +82,7 @@
 
     if (xgboostArgs.isToTransform) {
       println("\n------ Transforming ------")
-      var (results, _) = Benchmark.time(s"Mortgage CPU transform ${xgboostArgs.format}") {
+      var (results, _) = benchmark.time("transform") {
         val ret = xgbClassificationModel.transform(datasets(2).get).cache()
         // Trigger the transformation
         ret.foreachPartition(_ => ())
@@ -94,8 +97,11 @@
 
       println("\n------Accuracy of Evaluation------")
       val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)
-      val accuracy = evaluator.evaluate(results)
-      println("Accuracy: " + accuracy)
+      evaluator.evaluate(results) match {
+        case accuracy if !accuracy.isNaN =>
+          benchmark.value(accuracy, "Accuracy", "Accuracy for")
+        // Throw an exception when NaN ?
+      }
     }
 
     spark.close()
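The `Benchmark(appInfo(0), appInfo(1), appInfo(2))` helper and its `time`/`value` methods are new in this release, but their implementation is not among the hunks shown. Purely for orientation, here is a hypothetical shape consistent with the call sites above (`benchmark.time("train") { ... }` returning a tuple, `benchmark.value(v, name, prefix)`); the real class ships with the examples and may differ:

```scala
// Hypothetical reconstruction from call sites only, not the shipped code.
case class Benchmark(appName: String, processor: String, dataFormat: String) {
  // Times a block, prints the elapsed seconds, and returns (result, seconds).
  def time[R](phase: String)(block: => R): (R, Double) = {
    val start = System.nanoTime()
    val result = block
    val elapsed = (System.nanoTime() - start) / 1e9
    println(s"[$appName $processor $phase $dataFormat]: $elapsed seconds")
    (result, elapsed)
  }

  // Logs a named metric, e.g. benchmark.value(accuracy, "Accuracy", "Accuracy for").
  def value(v: Double, name: String, prefix: String): Unit =
    println(s"$prefix [$appName $processor $dataFormat]: $name == $v")
}
```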
GPUMain.scala (Mortgage example)
@@ -26,35 +26,27 @@ object GPUMain extends Mortgage {
 
   def main(args: Array[String]): Unit = {
     val xgboostArgs = XGBoostArgs.parse(args)
+    val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
+    val appInfo = Seq(appName, processor, xgboostArgs.format)
 
     // build spark session
-    val objName = this.getClass.getSimpleName.stripSuffix("$")
     val spark = SparkSession.builder()
-      .appName(s"Mortgage-$objName-${xgboostArgs.format}")
+      .appName(appInfo.mkString("-"))
       .getOrCreate()
 
+    val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
     // === diff ===
     // build data reader
     val dataReader = new GpuDataReader(spark)
+      .option("asFloats", xgboostArgs.asFloats).option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
 
     // load datasets, the order is (train, train-eval, eval)
     var datasets = xgboostArgs.dataPaths.map(_.map{
       path =>
         xgboostArgs.format match {
-          case "csv" => dataReader
-            .option("header", xgboostArgs.hasHeader)
-            .option("asFloats", xgboostArgs.asFloats)
-            .option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
-            .schema(schema)
-            .csv(path)
-          case "parquet" => dataReader
-            .option("asFloats", xgboostArgs.asFloats)
-            .option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
-            .parquet(path)
-          case "orc" => dataReader
-            .option("asFloats", xgboostArgs.asFloats)
-            .option("maxRowsPerChunk", xgboostArgs.maxRowsPerChunk)
-            .orc(path)
+          case "csv" => dataReader.option("header", xgboostArgs.hasHeader).schema(schema).csv(path)
+          case "parquet" => dataReader.parquet(path)
+          case "orc" => dataReader.orc(path)
           case _ => throw new IllegalArgumentException("Unsupported data file format!")
         }
     })
@@ -77,7 +69,8 @@
 
     // Start training
     println("\n------ Training ------")
-    val (model, _) = Benchmark.time(s"Mortgage GPU train ${xgboostArgs.format}") {
+    // Shall we not log the time if it is abnormal, which is usually caused by training failure
+    val (model, _) = benchmark.time("train") {
       xgbClassifier.fit(datasets(0).get)
     }
     // Save model if modelPath exists
@@ -90,7 +83,7 @@
 
     if (xgboostArgs.isToTransform) {
       println("\n------ Transforming ------")
-      var (results, _) = Benchmark.time(s"Mortgage GPU transform ${xgboostArgs.format}") {
+      var (results, _) = benchmark.time("transform") {
         val ret = xgbClassificationModel.transform(datasets(2).get).cache()
         // Trigger the transformation
         ret.foreachPartition(_ => ())
@@ -105,8 +98,11 @@
 
       println("\n------Accuracy of Evaluation------")
       val evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)
-      val accuracy = evaluator.evaluate(results)
-      println("Accuracy: " + accuracy)
+      evaluator.evaluate(results) match {
+        case accuracy if !accuracy.isNaN =>
+          benchmark.value(accuracy, "Accuracy", "Accuracy for")
+        // Throw an exception when NaN ?
+      }
     }
 
     spark.close()
Mortgage.scala (shared Mortgage trait)
@@ -18,6 +18,7 @@ package ai.rapids.spark.examples.mortgage
 import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType}
 
 private[mortgage] trait Mortgage {
+  val appName = "Mortgage"
   val labelColName = "delinquency_12"
 
   val schema = StructType(List(
CPUMain.scala (Taxi example)
@@ -26,13 +26,15 @@ object CPUMain extends Taxi {
 
   def main(args: Array[String]): Unit = {
     val xgboostArgs = XGBoostArgs.parse(args)
+    val processor = this.getClass.getSimpleName.stripSuffix("$").substring(0, 3)
+    val appInfo = Seq(appName, processor, xgboostArgs.format)
 
     // build spark session
-    val objName = this.getClass.getSimpleName.stripSuffix("$")
     val spark = SparkSession.builder()
-      .appName(s"Taxi-$objName-${xgboostArgs.format}")
+      .appName(appInfo.mkString("-"))
       .getOrCreate()
 
+    val benchmark = Benchmark(appInfo(0), appInfo(1), appInfo(2))
     // === diff ===
     // build data reader
     val dataReader = spark.read
@@ -65,7 +67,8 @@
       .setFeaturesCol("features")
 
     println("\n------ Training ------")
-    val (model, _) = Benchmark.time(s"Taxi CPU train ${xgboostArgs.format}") {
+    // Shall we not log the time if it is abnormal, which is usually caused by training failure
+    val (model, _) = benchmark.time("train") {
       xgbRegressor.fit(datasets(0).get)
     }
     // Save model if modelPath exists
@@ -78,7 +81,7 @@
 
     if (xgboostArgs.isToTransform) {
       println("\n------ Transforming ------")
-      var (prediction, _) = Benchmark.time(s"Taxi CPU transform ${xgboostArgs.format}") {
+      var (prediction, _) = benchmark.time("transform") {
        val ret = xgbRegressionModel.transform(datasets(2).get).cache()
        ret.foreachPartition(_ => ())
        ret
@@ -92,10 +95,10 @@
 
       println("\n------Accuracy of Evaluation------")
       val evaluator = new RegressionEvaluator().setLabelCol(labelColName)
-      val (rmse, _) = Benchmark.time(s"Taxi CPU evaluation ${xgboostArgs.format}") {
-        evaluator.evaluate(prediction)
+      evaluator.evaluate(prediction) match {
+        case rmse if !rmse.isNaN => benchmark.value(rmse, "RMSE", "RMSE for")
+        // Throw an exception when NaN ?
       }
-      println(s"RMSE == $rmse")
     }
 
     spark.close()
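For reference, Spark's `RegressionEvaluator` computes RMSE by default, which is why the Taxi example can log the raw `evaluate` result as RMSE. Making the metric explicit is an option; a sketch, not part of this commit (`labelColName` is the label column defined in the Taxi trait of these examples):

```scala
import org.apache.spark.ml.evaluation.RegressionEvaluator

// RegressionEvaluator defaults to RMSE; setting it explicitly documents intent.
val evaluator = new RegressionEvaluator()
  .setLabelCol(labelColName)
  .setMetricName("rmse") // also supports "mse", "r2", "mae"
```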