Merge branch 'xgb170-2212-updates' of https://github.com/nvliyuan/spark-rapids-examples into xgb170-2212-updates
nvliyuan committed Dec 15, 2022
2 parents 47fcbff + cc24728 commit 9b85f1f
Showing 6 changed files with 30 additions and 32 deletions.

@@ -121,7 +121,6 @@
     "\n",
     "# if you pass/unpack the archive file and enable the environment\n",
     "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
-    "conf.set(\"spark.executor.resource.gpu.discoveryScript\",\"/your-path/getGpusResources.sh\")\n",
     "# Create spark session\n",
     "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
     "\n",
@@ -146,9 +145,9 @@
     "schema = StructType([ StructField(x, FloatType()) for x in [label] + features ])\n",
     "\n",
     "# You need to update them to your real paths!\n",
-    "dataRoot = os.getenv(\"DATA_ROOT\", \"/your-path\")\n",
-    "train_path = dataRoot + \"/your-path\"\n",
-    "eval_path = dataRoot + \"/your-path\"\n",
+    "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
+    "train_path = dataRoot + \"/agaricus/csv/train\"\n",
+    "eval_path = dataRoot + \"/agaricus/csv/eval\"\n",
     "\n",
     "data_format = 'csv'\n",
     "has_header = 'true'\n",
@@ -294,7 +293,7 @@
     }
    ],
    "source": [
-    "model.write().overwrite().save(dataRoot + '/your-path')"
+    "model.write().overwrite().save(dataRoot + '/model/agaricus')"
    ]
   },
   {
@@ -303,7 +302,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "loaded_model = SparkXGBClassifierModel().load(dataRoot + '/mortgage/model')"
+    "loaded_model = SparkXGBClassifierModel().load(dataRoot + '/model/agaricus')"
    ]
   },
   {
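The agaricus notebook's hunks above drop the hard-coded spark.executor.resource.gpu.discoveryScript setting and point the dataset paths at a DATA_ROOT-driven default. For orientation, a minimal sketch of the session setup that remains after this change; the RAPIDS_JAR default below is a placeholder, not something this commit sets:

```python
import os
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Placeholder path: point this at your actual RAPIDS Accelerator jar.
RAPIDS_JAR = os.getenv("RAPIDS_JAR", "/opt/sparkRapidsPlugin/rapids-4-spark.jar")

conf = SparkConf()
conf.set("spark.executor.extraClassPath", RAPIDS_JAR)
# Only needed if you pass/unpack a packed Python environment archive:
# conf.set("spark.yarn.dist.archives", "your_pyspark_venv.tar.gz#environment")

# Create spark session; GPU discovery is now left to the cluster/job configuration
# rather than a notebook-level discoveryScript setting.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
```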

@@ -28,6 +28,7 @@
     "import time\n",
     "import os\n",
     "from pyspark import broadcast\n",
+    "from pyspark.conf import SparkConf\n",
     "from pyspark.sql import SparkSession\n",
     "from pyspark.sql.functions import *\n",
     "from pyspark.sql.types import *\n",
@@ -53,7 +54,8 @@
    "source": [
     "# The input path of dataset\n",
     "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
-    "orig_raw_path = dataRoot + \"/mortgage/input/\"orig_raw_path_csv2parquet = dataRoot + \"/mortgage/output/csv2parquet/\""
+    "orig_raw_path = dataRoot + \"/mortgage/input/\"\n",
+    "orig_raw_path_csv2parquet = dataRoot + \"/mortgage/output/csv2parquet/\""
    ]
   },
   {
@@ -926,11 +928,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# This sample uses 2 workers(GPUs) to run XGBoost training \n",
+    "# This sample uses 1 worker(GPU) to run XGBoost training, you can change according to your GPU resources\n",
     "params = { \n",
     "    \"tree_method\": \"gpu_hist\",\n",
     "    \"grow_policy\": \"depthwise\",\n",
-    "    \"num_workers\": 2,\n",
+    "    \"num_workers\": 1,\n",
     "    \"use_gpu\": \"true\",\n",
     "}\n",
     "params['features_col'] = features\n",
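The last hunk above drops num_workers from 2 to 1 so the training step fits a single GPU. As a rough sketch of how a params dict like this is usually handed to the xgboost.spark estimator (the classifier class and the features/label variables are assumptions borrowed from the other notebooks in this commit):

```python
from xgboost.spark import SparkXGBClassifier

params = {
    "tree_method": "gpu_hist",
    "grow_policy": "depthwise",
    "num_workers": 1,      # one distributed worker == one GPU; raise to match your GPU count
    "use_gpu": "true",
}
params["features_col"] = features   # list of feature column names defined earlier in the notebook
params["label_col"] = label         # assumed label column name

classifier = SparkXGBClassifier(**params)
model = classifier.fit(train_data)  # train_data is the DataFrame prepared above
```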

@@ -106,7 +106,6 @@
     "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n",
     "# if you pass/unpack the archive file and enable the environment\n",
     "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
-    "conf.set(\"spark.executor.resource.gpu.discoveryScript\",\"/your-path/getGpusResources.sh\")\n",
     "# Create spark session\n",
     "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
     "\n",
@@ -160,11 +159,11 @@
     "features = [ x.name for x in schema if x.name != label ]\n",
     "\n",
     "# You need to update them to your real paths!\n",
-    "dataRoot = os.getenv(\"DATA_ROOT\", \"/your-path\")\n",
-    "train_path = dataRoot + \"/your-path\"\n",
-    "eval_path = dataRoot + \"/your-path\"\n",
+    "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
+    "train_path = dataRoot + \"/mortgage/output/train\"\n",
+    "eval_path = dataRoot + \"/mortgage/output/eval\"\n",
     "\n",
-    "data_format = 'csv'\n",
+    "data_format = 'parquet'\n",
     "has_header = 'true'\n",
     "if data_format == 'csv':\n",
     "    train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n",
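With the default flipped from 'csv' to 'parquet', the csv branch shown above becomes the fallback. A sketch of the full read pattern; the parquet branch is an assumption filled in here, since the diff only shows the csv side:

```python
# reader is spark.read; schema, train_path and eval_path come from the cells above.
data_format = 'parquet'
has_header = 'true'

if data_format == 'csv':
    train_data = reader.schema(schema).option('header', has_header).csv(train_path)
    eval_data = reader.schema(schema).option('header', has_header).csv(eval_path)
else:
    # Parquet files carry their own schema, so no explicit schema/header options are needed.
    train_data = reader.parquet(train_path)
    eval_data = reader.parquet(eval_path)
```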

@@ -182,11 +182,11 @@
     "features = [ x.name for x in schema if x.name != label ]\n",
     "\n",
     "# You need to update them to your real paths!\n",
-    "dataRoot = os.getenv(\"DATA_ROOT\", \"/your-path\")\n",
-    "train_path = dataRoot + \"/your-path\"\n",
-    "eval_path = dataRoot + \"/your-path\"\n",
+    "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
+    "train_path = dataRoot + \"/mortgage/output/train\"\n",
+    "eval_path = dataRoot + \"/mortgage/output/eval\"\n",
     "\n",
-    "data_format = 'csv'\n",
+    "data_format = 'parquet'\n",
     "has_header = 'true'\n",
     "if data_format == 'csv':\n",
     "    train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n",
@@ -341,7 +341,7 @@
     }
    ],
    "source": [
-    "model.write().overwrite().save(dataRoot + '/your-path')"
+    "model.write().overwrite().save(dataRoot + '/model/mortgage')"
    ]
   },
   {
@@ -350,7 +350,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "loaded_model = SparkXGBClassifierModel().load(dataRoot + '/mortgage/model')"
+    "loaded_model = SparkXGBClassifierModel().load(dataRoot + '/model/mortgage')"
    ]
   },
   {
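Both the save and load cells in this notebook now point at dataRoot + '/model/mortgage', so the round trip is self-consistent. A minimal sketch of that round trip, assuming the xgboost.spark classes used elsewhere in these notebooks:

```python
from xgboost.spark import SparkXGBClassifierModel

model_path = dataRoot + '/model/mortgage'

# Persist the freshly trained model, overwriting any previous run.
model.write().overwrite().save(model_path)

# Reload it later and score a DataFrame with the same feature columns.
loaded_model = SparkXGBClassifierModel.load(model_path)
predictions = loaded_model.transform(eval_data)
```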

@@ -38,8 +38,8 @@
     "from time import time\n",
     "from pyspark.conf import SparkConf\n",
     "import os\n",
-    "os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
-    "os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\""
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
+    "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\""
    ]
   },
   {
@@ -107,7 +107,6 @@
     "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n",
     "# if you pass/unpack the archive file and enable the environment\n",
     "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
-    "conf.set(\"spark.executor.resource.gpu.discoveryScript\",\"/your-path/getGpusResources.sh\")\n",
     "# Create spark session\n",
     "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
     "\n",
@@ -150,9 +149,9 @@
     "features = [ x.name for x in schema if x.name != label ]\n",
     "\n",
     "# You need to update them to your real paths!\n",
-    "dataRoot = os.getenv(\"DATA_ROOT\", \"/your-path\")\n",
-    "train_path = dataRoot + \"/your-path\"\n",
-    "eval_path = dataRoot + \"/your-path\"\n",
+    "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
+    "train_path = dataRoot + \"/taxi/csv/train\"\n",
+    "eval_path = dataRoot + \"/taxi/csv/test\"\n",
     "\n",
     "data_format = 'csv'\n",
     "has_header = 'true'\n",
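Commenting out PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON matches the spark.yarn.dist.archives line that is already commented out in the session cell: the two only make sense together, when a packed Python environment is shipped with the job. A sketch of that optional setup, using the placeholder archive name from the notebooks:

```python
import os
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

# Only when shipping a packed environment: both interpreters resolve inside
# the directory that the '#environment' suffix below unpacks the archive into.
os.environ['PYSPARK_PYTHON'] = "./environment/bin/python"
os.environ['PYSPARK_DRIVER_PYTHON'] = "./environment/bin/python"

conf = SparkConf()
conf.set("spark.yarn.dist.archives", "your_pyspark_venv.tar.gz#environment")

spark = SparkSession.builder.config(conf=conf).getOrCreate()
```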
11 changes: 5 additions & 6 deletions examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb

@@ -120,7 +120,6 @@
     "\n",
     "# if you pass/unpack the archive file and enable the environment\n",
     "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
-    "conf.set(\"spark.executor.resource.gpu.discoveryScript\",\"/your-path/getGpusResources.sh\")\n",
     "# Create spark session\n",
     "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
     "\n",
@@ -162,9 +161,9 @@
     "features = [ x.name for x in schema if x.name != label ]\n",
     "\n",
     "# You need to update them to your real paths!\n",
-    "dataRoot = os.getenv(\"DATA_ROOT\", \"/your-path\")\n",
-    "train_path = dataRoot + \"/your-path\"\n",
-    "eval_path = dataRoot + \"/your-path\"\n",
+    "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
+    "train_path = dataRoot + \"/taxi/csv/train\"\n",
+    "eval_path = dataRoot + \"/taxi/csv/test\"\n",
     "\n",
     "data_format = 'csv'\n",
     "has_header = 'true'\n",
@@ -311,7 +310,7 @@
     }
    ],
    "source": [
-    "model.write().overwrite().save(dataRoot + '/your-path')"
+    "model.write().overwrite().save(dataRoot + '/model/taxi')"
    ]
   },
   {
@@ -320,7 +319,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "loaded_model = SparkXGBRegressorModel().load(dataRoot + '/mortgage/model)"
+    "loaded_model = SparkXGBRegressorModel().load(dataRoot + '/model/taxi')"
    ]
   },
   {
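Taken together, all six notebooks now resolve their data and model locations from DATA_ROOT with a /data fallback. A small sketch of overriding it before the session is created (the directory below is hypothetical):

```python
import os

# Hypothetical location of your datasets; set this before creating the SparkSession.
os.environ["DATA_ROOT"] = "/mnt/datasets/xgboost-examples"

dataRoot = os.getenv("DATA_ROOT", "/data")
print(dataRoot + "/taxi/csv/train")   # -> /mnt/datasets/xgboost-examples/taxi/csv/train
```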
