diff --git a/datasets/mortgage-small.tar.gz b/datasets/mortgage-small.tar.gz
deleted file mode 100644
index 2f7c6a016..000000000
Binary files a/datasets/mortgage-small.tar.gz and /dev/null differ
diff --git a/docs/get-started/xgboost-examples/building-sample-apps/python.md b/docs/get-started/xgboost-examples/building-sample-apps/python.md
index 28f35d40d..53f4e66c6 100644
--- a/docs/get-started/xgboost-examples/building-sample-apps/python.md
+++ b/docs/get-started/xgboost-examples/building-sample-apps/python.md
@@ -17,7 +17,8 @@ Two files are required by PySpark:
+ *samples.zip*
- the package including all example code
+ the package including all example code.
+    Executing the above build commands generates the samples.zip file in the 'spark-rapids-examples/examples/XGBoost-Examples' folder.
+ *main.py*
diff --git a/docs/get-started/xgboost-examples/dataset/mortgage.md b/docs/get-started/xgboost-examples/dataset/mortgage.md
new file mode 100644
index 000000000..1c36155fa
--- /dev/null
+++ b/docs/get-started/xgboost-examples/dataset/mortgage.md
@@ -0,0 +1,22 @@
+# How to download the Mortgage dataset
+
+
+
+## Steps to download the data
+
+1. Go to the [Fannie Mae](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data) website
+2. Click on [Single-Family Loan Performance Data](https://datadynamics.fanniemae.com/data-dynamics/?&_ga=2.181456292.2043790680.1657122341-289272350.1655822609#/reportMenu;category=HP)
+ * Register as a new user if you are using the website for the first time
+    * Use the credentials to log in
+3. Select [HP](https://datadynamics.fanniemae.com/data-dynamics/#/reportMenu;category=HP)
+4. Click on **Download Data** and choose *Single-Family Loan Performance Data*
+5. You will find a tabular list of 'Acquisition and Performance' files sorted by year and quarter. Click on a file to download it (e.g. `2017Q1.zip`)
+6. Unzip the downloaded file to extract the CSV file (e.g. `2017Q1.csv`)
+7. Copy only the CSV files to a new folder for the ETL to read, as shown in the sketch after this list
+
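+A minimal sketch of steps 6 and 7, assuming the quarterly zip files were downloaded to `~/Downloads` and the ETL reads its input from `/opt/xgboost/mortgage/input` (both paths are placeholders, adjust them to your setup):
+
+```python
+import glob
+import os
+import shutil
+import zipfile
+
+# Hypothetical locations; change them to where you downloaded the zip files
+# and to the folder your Mortgage ETL job reads from.
+download_dir = os.path.expanduser("~/Downloads")
+input_dir = "/opt/xgboost/mortgage/input"
+os.makedirs(input_dir, exist_ok=True)
+
+# Step 6: each archive (e.g. 2017Q1.zip) contains a pipe-delimited CSV file.
+for zip_path in glob.glob(os.path.join(download_dir, "*Q*.zip")):
+    with zipfile.ZipFile(zip_path) as archive:
+        archive.extractall(download_dir)
+
+# Step 7: copy only the CSV files into the folder the ETL reads.
+for csv_path in glob.glob(os.path.join(download_dir, "*Q*.csv")):
+    shutil.copy(csv_path, input_dir)
+```
+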
+## Notes
+1. Refer to the [Loan Performance Data Tutorial](https://capitalmarkets.fanniemae.com/media/9066/display) for more details.
+2. Note that the *Single-Family Loan Performance Data* has two components. However, the Mortgage ETL requires only the first one (the primary dataset):
+ * Primary Dataset: Acquisition and Performance Files
+ * HARP Dataset
+3. Use the [Resources](https://datadynamics.fanniemae.com/data-dynamics/#/resources/HP) section to learn more about the dataset
\ No newline at end of file
diff --git a/docs/get-started/xgboost-examples/notebook/python-notebook.md b/docs/get-started/xgboost-examples/notebook/python-notebook.md
index 94486d58c..037353254 100644
--- a/docs/get-started/xgboost-examples/notebook/python-notebook.md
+++ b/docs/get-started/xgboost-examples/notebook/python-notebook.md
@@ -30,6 +30,8 @@ and the home directory for Apache Spark respectively.
--conf spark.plugins=com.nvidia.spark.SQLPlugin \
--conf spark.rapids.memory.gpu.pooling.enabled=false \
--conf spark.executor.resource.gpu.amount=1 \
+ --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \
+ --conf spark.rapids.sql.hasNans=false \
--conf spark.task.resource.gpu.amount=1 \
--conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \
--files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
index 887c39d02..5b46b51e6 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md
@@ -60,9 +60,10 @@ on cluster filesystems like HDFS, or in [object stores like S3 and GCS](https://
Note that using [application dependencies](https://spark.apache.org/docs/latest/running-on-kubernetes.html#dependency-management) from
the submission client’s local file system is currently not yet supported.
-Note: the `mortgage_eval_merged.csv` and `mortgage_train_merged.csv` are not Mortgage raw data,
-they are the data produced by Mortgage ETL job. If user wants to use a larger size Mortgage data, please refer to [Launch ETL job](#etl).
-Taxi ETL job is the same. But Agaricus does not have ETL process, it is combined with XGBoost as there is just a filter operation.
+#### Note:
+1. The Mortgage and Taxi jobs have ETL steps that generate the processed data.
+2. For convenience, a subset of the [Taxi](/datasets/) dataset is made available in this repo and can be readily used to launch the XGBoost job. Use the [ETL](#etl) job to generate larger datasets for training and testing.
+3. Agaricus does not have an ETL process; it is combined with XGBoost since there is only a filter operation.
Save Kubernetes Template Resources
----------------------------------
@@ -89,16 +90,20 @@ to execute using a GPU which is already in use -- causing undefined behavior and
Launch Mortgage or Taxi ETL Part
---------------------------
+Use the ETL app to process the raw Mortgage data. You can either split the ETL output into training and evaluation sets, or run the ETL on different subsets of the raw data to produce separate training and evaluation datasets.
Run spark-submit
``` bash
${SPARK_HOME}/bin/spark-submit \
--conf spark.plugins=com.nvidia.spark.SQLPlugin \
- --conf spark.rapids.memory.gpu.pooling.enabled=false \
--conf spark.executor.resource.gpu.amount=1 \
--conf spark.task.resource.gpu.amount=1 \
+ --conf spark.rapids.sql.incompatibleDateFormats.enabled=true \
+ --conf spark.rapids.sql.csv.read.double.enabled=true \
--conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \
+ --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \
+ --conf spark.rapids.sql.hasNans=false \
--files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh \
--jars ${RAPIDS_JAR} \
--master \
@@ -106,18 +111,15 @@ ${SPARK_HOME}/bin/spark-submit \
--num-executors ${SPARK_NUM_EXECUTORS} \
--driver-memory ${SPARK_DRIVER_MEMORY} \
--executor-memory ${SPARK_EXECUTOR_MEMORY} \
- --class ${EXAMPLE_CLASS} \
--class com.nvidia.spark.examples.mortgage.ETLMain \
$SAMPLE_JAR \
-format=csv \
- -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-train/" \
- -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-train/" \
- -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/train/"
-
-# if generating eval data, change the data path to eval as well as the corresponding perf-eval and acq-eval data
-# -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-eval"
-# -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-eval"
-# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/eval/"
+ -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \
+ -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/"
+
+# if generating eval data, change the data path to eval
+# -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/"
+# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/"
# if running Taxi ETL benchmark, change the class and data path params to
# -class com.nvidia.spark.examples.taxi.ETLMain
# -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path"
@@ -163,9 +165,9 @@ export SPARK_DRIVER_MEMORY=4g
export SPARK_EXECUTOR_MEMORY=8g
# example class to use
-export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.GPUMain
-# or change to com.nvidia.spark.examples.taxi.GPUMain to run Taxi Xgboost benchmark
-# or change to com.nvidia.spark.examples.agaricus.GPUMain to run Agaricus Xgboost benchmark
+export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.Main
+# or change to com.nvidia.spark.examples.taxi.Main to run Taxi Xgboost benchmark
+# or change to com.nvidia.spark.examples.agaricus.Main to run Agaricus Xgboost benchmark
# tree construction algorithm
export TREE_METHOD=gpu_hist
@@ -192,9 +194,9 @@ ${SPARK_HOME}/bin/spark-submit
--conf spark.kubernetes.executor.podTemplateFile=${TEMPLATE_PATH} \
--conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \
${SAMPLE_JAR} \
- -dataPath=train::${DATA_PATH}/mortgage/csv/train/mortgage_train_merged.csv \
- -dataPath=trans::${DATA_PATH}/mortgage/csv/test/mortgage_eval_merged.csv \
- -format=csv \
+ -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \
+ -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \
+ -format=parquet \
-numWorkers=${SPARK_NUM_EXECUTORS} \
-treeMethod=${TREE_METHOD} \
-numRound=100 \
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
index 55ac2a1c4..ae06e201c 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
@@ -53,6 +53,13 @@ Get Application Files, Jar and Dataset
Make sure you have prepared the necessary packages and dataset by following this [guide](/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md)
+
+#### Note:
+1. The Mortgage and Taxi jobs have ETL steps that generate the processed data.
+2. For convenience, a subset of the [Taxi](/datasets/) dataset is made available in this repo and can be readily used to launch the XGBoost job. Use the [ETL](#etl) job to generate larger datasets for training and testing.
+3. Agaricus does not have an ETL process; it is combined with XGBoost since there is only a filter operation.
+
+
Launch a Standalone Spark Cluster
---------------------------------
@@ -83,9 +90,8 @@ Launch a Standalone Spark Cluster
Launch Mortgage or Taxi ETL Part
---------------------------
-
-Run spark-submit
-
+Use the ETL app to process the raw Mortgage data. You can either split the ETL output into training and evaluation sets (see the sketch after the GPU example below), or run the ETL on different subsets of the raw data to produce separate training and evaluation datasets.
+
+### ETL on GPU
``` bash
${SPARK_HOME}/bin/spark-submit \
--master spark://$HOSTNAME:7077 \
@@ -95,18 +101,39 @@ ${SPARK_HOME}/bin/spark-submit \
--conf spark.plugins=com.nvidia.spark.SQLPlugin \
--conf spark.rapids.sql.incompatibleDateFormats.enabled=true \
--conf spark.rapids.sql.csv.read.double.enabled=true \
+ --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \
+ --conf spark.rapids.sql.hasNans=false \
--py-files ${SAMPLE_ZIP} \
main.py \
--mainClass='com.nvidia.spark.examples.mortgage.etl_main' \
--format=csv \
- --dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-train/" \
- --dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-train/" \
- --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/train/"
-
-# if generating eval data, change the data path to eval as well as the corresponding perf-eval and acq-eval data
-# --dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-eval"
-# --dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-eval"
-# --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/eval/"
+ --dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \
+ --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/"
+
+# if generating eval data, change the data path to eval
+# --dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/"
+# --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/"
+# if running Taxi ETL benchmark, change the class and data path params to
+# -class com.nvidia.spark.examples.taxi.ETLMain
+# -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path"
+# -dataPath="out::${SPARK_XGBOOST_DIR}/taxi/your-path"
+```
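+If you take the first route, the sketch below shows one way to split the ETL output with PySpark. It is only an illustration and assumes a single ETL run wrote all processed data to a hypothetical `output/data/` folder; adjust the paths to match your run.
+
+```python
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder.appName("mortgage-train-eval-split").getOrCreate()
+
+# Hypothetical paths: one ETL run wrote everything to output/data/.
+etl_output = "/opt/xgboost/mortgage/output/data/"
+train_path = "/opt/xgboost/mortgage/output/train/"
+eval_path = "/opt/xgboost/mortgage/output/eval/"
+
+etl_df = spark.read.parquet(etl_output)
+
+# 80/20 random split, the same ratio used in the example notebooks.
+train_df, eval_df = etl_df.randomSplit([0.8, 0.2], seed=42)
+train_df.write.mode("overwrite").parquet(train_path)
+eval_df.write.mode("overwrite").parquet(eval_path)
+```
+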
+### ETL on CPU
+```bash
+${SPARK_HOME}/bin/spark-submit \
+ --master spark://$HOSTNAME:7077 \
+ --executor-memory 32G \
+ --conf spark.executor.instances=1 \
+ --py-files ${SAMPLE_ZIP} \
+ main.py \
+ --mainClass='com.nvidia.spark.examples.mortgage.etl_main' \
+ --format=csv \
+ --dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \
+ --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/"
+
+# if generating eval data, change the data path to eval
+# --dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/"
+# --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/"
# if running Taxi ETL benchmark, change the class and data path params to
# -class com.nvidia.spark.examples.taxi.ETLMain
# -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path"
@@ -166,8 +193,8 @@ ${SPARK_HOME}/bin/spark-submit
--py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
${MAIN_PY} \
--mainClass=${EXAMPLE_CLASS} \
- --dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/out/train/ \
- --dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/out/eval/ \
+ --dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \
+ --dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \
--format=parquet \
--numWorkers=${SPARK_NUM_EXECUTORS} \
--treeMethod=${TREE_METHOD} \
@@ -240,8 +267,8 @@ ${SPARK_HOME}/bin/spark-submit
--py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
${SPARK_PYTHON_ENTRYPOINT} \
--mainClass=${EXAMPLE_CLASS} \
- --dataPath=train::${DATA_PATH}/mortgage/out/train/ \
- --dataPath=trans::${DATA_PATH}/mortgage/out/eval/ \
+ --dataPath=train::${DATA_PATH}/mortgage/output/train/ \
+ --dataPath=trans::${DATA_PATH}/mortgage/output/eval/ \
--format=parquet \
--numWorkers=${SPARK_NUM_EXECUTORS} \
--treeMethod=${TREE_METHOD} \
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md
index 5493340c2..35a2ed238 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md
@@ -53,9 +53,11 @@ Get Jars and Dataset
Make sure you have prepared the necessary packages and dataset
by following this [guide](/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md)
-Note: the `mortgage_eval_merged.csv` and `mortgage_train_merged.csv` are not Mortgage raw data,
-they are the data produced by Mortgage ETL job. If user wants to use a larger size Mortgage data, please refer to [Launch ETL job](#etl).
-Taxi ETL job is the same. But Agaricus does not have ETL process, it is combined with XGBoost as there is just a filter operation.
+#### Note:
+1. The Mortgage and Taxi jobs have ETL steps that generate the processed data.
+2. For convenience, a subset of the [Taxi](/datasets/) dataset is made available in this repo and can be readily used to launch the XGBoost job. Use the [ETL](#etl) job to generate larger datasets for training and testing.
+3. Agaricus does not have an ETL process; it is combined with XGBoost since there is only a filter operation.
+
Launch a Standalone Spark Cluster
---------------------------------
@@ -90,31 +92,53 @@ Launch a Standalone Spark Cluster
Launch Mortgage or Taxi ETL Part
---------------------------
-If user wants to use a larger size dataset other than the default one, we provide an ETL app to process raw Mortgage data.
-
+Use the ETL app to process the raw Mortgage data. You can either split the ETL output into training and evaluation sets, or run the ETL on different subsets of the raw data to produce separate training and evaluation datasets.
Run spark-submit
+### ETL on GPU
``` bash
${SPARK_HOME}/bin/spark-submit \
--master spark://$HOSTNAME:7077 \
--executor-memory 32G \
- --conf spark.rapids.memory.gpu.pooling.enabled=false \
--conf spark.executor.resource.gpu.amount=1 \
--conf spark.task.resource.gpu.amount=1 \
--conf spark.plugins=com.nvidia.spark.SQLPlugin \
--conf spark.rapids.sql.incompatibleDateFormats.enabled=true \
--conf spark.rapids.sql.csv.read.double.enabled=true \
+ --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \
+ --conf spark.rapids.sql.hasNans=false \
--class com.nvidia.spark.examples.mortgage.ETLMain \
$SAMPLE_JAR \
-format=csv \
- -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-train/" \
- -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-train/" \
- -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/train/"
-
-# if generating eval data, change the data path to eval as well as the corresponding perf-eval and acq-eval data
-# -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-eval"
-# -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-eval"
-# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/eval/"
+ -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \
+ -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/"
+
+# if generating eval data, change the data path to eval
+# -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/"
+# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/"
+# if running Taxi ETL benchmark, change the class and data path params to
+# -class com.nvidia.spark.examples.taxi.ETLMain
+# -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path"
+# -dataPath="out::${SPARK_XGBOOST_DIR}/taxi/your-path"
+```
+
+### ETL on CPU
+
+```bash
+${SPARK_HOME}/bin/spark-submit \
+    --master spark://$HOSTNAME:7077 \
+    --executor-memory 32G \
+    --conf spark.executor.instances=1 \
+    --conf spark.sql.broadcastTimeout=700 \
+    --class com.nvidia.spark.examples.mortgage.ETLMain \
+    $SAMPLE_JAR \
+    -format=csv \
+    -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \
+    -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/"
+
+# if generating eval data, change the data path to eval
+# -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/"
+# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/"
# if running Taxi ETL benchmark, change the class and data path params to
# -class com.nvidia.spark.examples.taxi.ETLMain
# -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path"
@@ -150,7 +174,7 @@ export SPARK_DRIVER_MEMORY=4g
export SPARK_EXECUTOR_MEMORY=8g
# example class to use
-export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.GPUMain
+export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.Main
-# or change to com.nvidia.spark.examples.taxi.GPUMain to run Taxi Xgboost benchmark
-# or change to com.nvidia.spark.examples.agaricus.GPUMain to run Agaricus Xgboost benchmark
+# or change to com.nvidia.spark.examples.taxi.Main to run Taxi Xgboost benchmark
+# or change to com.nvidia.spark.examples.agaricus.Main to run Agaricus Xgboost benchmark
@@ -172,9 +196,9 @@ ${SPARK_HOME}/bin/spark-submit
--conf spark.cores.max=${TOTAL_CORES} \
--class ${EXAMPLE_CLASS} \
${SAMPLE_JAR} \
- -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/csv/train/mortgage_train_merged.csv \
- -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/csv/test/mortgage_eval_merged.csv \
- -format=csv \
+ -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \
+ -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \
+ -format=parquet \
-numWorkers=${SPARK_NUM_EXECUTORS} \
-treeMethod=${TREE_METHOD} \
-numRound=100 \
@@ -229,7 +253,7 @@ export SPARK_DRIVER_MEMORY=4g
export SPARK_EXECUTOR_MEMORY=8g
# example class to use
-export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.CPUMain
+export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.Main
# Please make sure to change the class while running Taxi or Agaricus benchmark
# tree construction algorithm
@@ -238,7 +262,7 @@ export TREE_METHOD=hist
This is the same command as for the GPU example, repeated for convenience:
-``` bash
+```bash
${SPARK_HOME}/bin/spark-submit \
--master ${SPARK_MASTER} \
--driver-memory ${SPARK_DRIVER_MEMORY} \
@@ -246,9 +270,9 @@ ${SPARK_HOME}/bin/spark-submit
--conf spark.cores.max=${TOTAL_CORES} \
--class ${EXAMPLE_CLASS} \
${SAMPLE_JAR} \
- -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/csv/train/mortgage_train_merged.csv \
- -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/csv/test/mortgage_eval_merged.csv \
- -format=csv \
+ -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \
+ -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \
+ -format=parquet \
-numWorkers=${SPARK_NUM_EXECUTORS} \
-treeMethod=${TREE_METHOD} \
-numRound=100 \
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
index 7966791a2..fe0605aef 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
@@ -47,25 +47,28 @@ Then create a directory in HDFS, and run below commands,
Launch Mortgage or Taxi ETL Part
---------------------------
-Run spark-submit:
-
+Use the ETL app to process the raw Mortgage data. You can either split the ETL output into training and evaluation sets, or run the ETL on different subsets of the raw data to produce separate training and evaluation datasets.
``` bash
# location where data was downloaded
export DATA_PATH=hdfs:/tmp/xgboost4j_spark_python/
${SPARK_HOME}/bin/spark-submit \
- --master yarn
- --deploy-mode cluster
+ --master yarn \
+ --deploy-mode cluster \
+ --conf spark.rapids.sql.incompatibleDateFormats.enabled=true \
+ --conf spark.rapids.sql.csv.read.double.enabled=true \
+ --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \
+ --conf spark.rapids.sql.hasNans=false \
--jars ${RAPIDS_JAR}\
${MAIN_PY} \
--mainClass='com.nvidia.spark.examples.mortgage.etl_main' \
--format=csv \
- --dataPath="perf::${DATA_PATH}/mortgage/data/mortgage/perf/" \
- --dataPath="acq::${DATA_PATH}/mortgage/data/mortgage/acq/" \
- --dataPath="out::${DATA_PATH}/mortgage/data/mortgage/out/train/"
+ --dataPath="data::${DATA_PATH}/mortgage/data/mortgage/input/" \
+ --dataPath="out::${DATA_PATH}/mortgage/data/mortgage/output/train/"
-# if generate eval data, change the data path to eval
-# --dataPath="out::${DATA_PATH}/mortgage/data/mortgage/out/eval/
+# if generating eval data, change the data path to eval
+# --dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/"
+# --dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/"
# if running Taxi ETL benchmark, change the class and data path params to
# -class com.nvidia.spark.examples.taxi.ETLMain
# -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path"
@@ -194,8 +197,8 @@ ${SPARK_HOME}/bin/spark-submit
--py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP} \
${MAIN_PY} \
--mainClass=${EXAMPLE_CLASS} \
- --dataPath=train::${DATA_PATH}/mortgage/out/train/ \
- --dataPath=trans::${DATA_PATH}/mortgage/out/eval/ \
+ --dataPath=train::${DATA_PATH}/mortgage/output/train/ \
+ --dataPath=trans::${DATA_PATH}/mortgage/output/eval/ \
--format=parquet \
--numWorkers=${SPARK_NUM_EXECUTORS} \
--treeMethod=${TREE_METHOD} \
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-scala.md b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-scala.md
index 9e6e4367b..052d62ec2 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-scala.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-scala.md
@@ -35,6 +35,11 @@ Get Jars and Dataset
Make sure you have prepared the necessary packages and dataset by following this [guide](/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md)
+#### Note:
+1. The Mortgage and Taxi jobs have ETL steps that generate the processed data.
+2. For convenience, a subset of the [Taxi](/datasets/) dataset is made available in this repo and can be readily used to launch the XGBoost job. Use the [ETL](#etl) job to generate larger datasets for training and testing.
+3. Agaricus does not have an ETL process; it is combined with XGBoost since there is only a filter operation.
+
Create a directory in HDFS, and copy:
``` bash
@@ -45,19 +50,20 @@ Create a directory in HDFS, and copy:
Launch Mortgage or Taxi ETL Part
---------------------------
-Note: the `mortgage_eval_merged.csv` and `mortgage_train_merged.csv` are not Mortgage raw data,
-they are the data produced by Mortgage ETL job. If user wants to use a larger size Mortgage data, please refer to [Launch ETL job](#etl).
-Taxi ETL job is the same. But Agaricus does not have ETL process, it is combined with XGBoost as there is just a filter operation.
+Use the ETL app to process the raw Mortgage data. You can either split the ETL output into training and evaluation sets, or run the ETL on different subsets of the raw data to produce separate training and evaluation datasets.
Run spark-submit
``` bash
${SPARK_HOME}/bin/spark-submit \
--conf spark.plugins=com.nvidia.spark.SQLPlugin \
- --conf spark.rapids.memory.gpu.pooling.enabled=false \
--conf spark.executor.resource.gpu.amount=1 \
--conf spark.task.resource.gpu.amount=1 \
+ --conf spark.rapids.sql.incompatibleDateFormats.enabled=true \
+ --conf spark.rapids.sql.csv.read.double.enabled=true \
--conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh \
+ --conf spark.sql.cache.serializer=com.nvidia.spark.ParquetCachedBatchSerializer \
+ --conf spark.rapids.sql.hasNans=false \
--files $SPARK_HOME/examples/src/main/scripts/getGpusResources.sh \
--jars ${RAPIDS_JAR} \
--master yarn \
@@ -65,18 +71,15 @@ ${SPARK_HOME}/bin/spark-submit \
--num-executors ${SPARK_NUM_EXECUTORS} \
--driver-memory ${SPARK_DRIVER_MEMORY} \
--executor-memory ${SPARK_EXECUTOR_MEMORY} \
- --class ${EXAMPLE_CLASS} \
--class com.nvidia.spark.examples.mortgage.ETLMain \
$SAMPLE_JAR \
-format=csv \
- -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-train/" \
- -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-train/" \
- -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/train/"
-
-# if generating eval data, change the data path to eval as well as the corresponding perf-eval and acq-eval data
-# -dataPath="perf::${SPARK_XGBOOST_DIR}/mortgage/perf-eval"
-# -dataPath="acq::${SPARK_XGBOOST_DIR}/mortgage/acq-eval"
-# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/out/eval/"
+ -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/" \
+ -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/train/"
+
+# if generating eval data, change the data path to eval
+# -dataPath="data::${SPARK_XGBOOST_DIR}/mortgage/input/"
+# -dataPath="out::${SPARK_XGBOOST_DIR}/mortgage/output/eval/"
# if running Taxi ETL benchmark, change the class and data path params to
# -class com.nvidia.spark.examples.taxi.ETLMain
# -dataPath="raw::${SPARK_XGBOOST_DIR}/taxi/your-path"
@@ -106,9 +109,9 @@ export SPARK_DRIVER_MEMORY=4g
export SPARK_EXECUTOR_MEMORY=8g
# example class to use
-export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.GPUMain
-# or change to com.nvidia.spark.examples.taxi.GPUMain to run Taxi Xgboost benchmark
-# or change to com.nvidia.spark.examples.agaricus.GPUMain to run Agaricus Xgboost benchmark
+export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.Main
+# or change to com.nvidia.spark.examples.taxi.Main to run Taxi Xgboost benchmark
+# or change to com.nvidia.spark.examples.agaricus.Main to run Agaricus Xgboost benchmark
# tree construction algorithm
export TREE_METHOD=gpu_hist
@@ -132,9 +135,9 @@ ${SPARK_HOME}/bin/spark-submit
--executor-memory ${SPARK_EXECUTOR_MEMORY} \
--class ${EXAMPLE_CLASS} \
${SAMPLE_JAR} \
- -dataPath=train::${DATA_PATH}/mortgage/csv/train/mortgage_train_merged.csv \
- -dataPath=trans::${DATA_PATH}/mortgage/csv/test/mortgage_eval_merged.csv \
- -format=csv \
+ -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \
+ -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \
+ -format=parquet \
-numWorkers=${SPARK_NUM_EXECUTORS} \
-treeMethod=${TREE_METHOD} \
-numRound=100 \
@@ -181,7 +184,7 @@ export SPARK_DRIVER_MEMORY=4g
export SPARK_EXECUTOR_MEMORY=8g
# example class to use
-export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.CPUMain
+export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.Main
# Please make sure to change the class while running Taxi or Agaricus benchmark
# tree construction algorithm
@@ -199,9 +202,9 @@ ${SPARK_HOME}/bin/spark-submit
--executor-memory ${SPARK_EXECUTOR_MEMORY} \
--class ${EXAMPLE_CLASS} \
${SAMPLE_JAR} \
- -dataPath=train::${DATA_PATH}/mortgage/csv/train/mortgage_train_merged.csv \
- -dataPath=trans::${DATA_PATH}/mortgage/csv/test/mortgage_eval_merged.csv \
- -format=csv \
+ -dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/ \
+ -dataPath=trans::${SPARK_XGBOOST_DIR}/mortgage/output/eval/ \
+ -format=parquet \
-numWorkers=${SPARK_NUM_EXECUTORS} \
-treeMethod=${TREE_METHOD} \
-numRound=100 \
diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
index cbeeccdbb..48558b409 100644
--- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
+++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md
@@ -17,9 +17,10 @@ Following this [guide](/docs/get-started/xgboost-examples/building-sample-apps/p
### Download dataset
-You need to download Mortgage dataset to `/opt/xgboost` from this [site](https://docs.rapids.ai/datasets/mortgage-data)
-, download Taxi dataset from this [site](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
-, download Agaricus dataset from this [site](https://gust.dev/r/xgboost-agaricus).
+You need to copy the datasets to `/opt/xgboost`. Use the following links to download the data:
+1. [Mortgage dataset](/docs/get-started/xgboost-examples/dataset/mortgage.md)
+2. [Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
+3. [Agaricus dataset](https://gust.dev/r/xgboost-agaricus)
### Setup environments
diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md
index a5f451778..e2cd6daa5 100644
--- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md
+++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md
@@ -13,9 +13,10 @@ Following this [guide](/docs/get-started/xgboost-examples/building-sample-apps/s
### Download dataset
-You need to download mortgage dataset to `/opt/xgboost` from this [site](https://docs.rapids.ai/datasets/mortgage-data)
-, download Taxi dataset from this [site](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
-, download Agaricus dataset from this [site](https://gust.dev/r/xgboost-agaricus).
+You need to copy the datasets to `/opt/xgboost`. Use the following links to download the data:
+1. [Mortgage dataset](/docs/get-started/xgboost-examples/dataset/mortgage.md)
+2. [Taxi dataset](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
+3. [Agaricus dataset](https://gust.dev/r/xgboost-agaricus)
### Setup environments
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb
index a544f5795..7b29bed7b 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb
@@ -49,9 +49,9 @@
"outputs": [],
"source": [
"# The input path of dataset\n",
- "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
- "orig_perf_path=dataRoot + \"/mortgage/Performance/\"\n",
- "orig_acq_path=dataRoot + \"/mortgage/Acquisition/\""
+ "# dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
+ "dataRoot = os.getenv(\"DATA_ROOT\", \"/mortgage\")\n",
+ "orig_raw_path = dataRoot + \"/input/\""
]
},
{
@@ -72,15 +72,13 @@
"metadata": {},
"outputs": [],
"source": [
- "# Set True to save the dataset after ETL\n",
+    "# Set True to save the processed dataset after ETL\n",
"# Set False, the dataset after ETL will be directly used in XGBoost train and transform\n",
+ "\n",
"is_save_dataset=True\n",
- "# the path to save the train dataset\n",
- "output_path_train=dataRoot + \"/mortgage/output/train/\"\n",
- "# the path to save the test dataset\n",
- "output_path_test=dataRoot + \"/mortgage/output/test/\"\n",
+ "output_path_data=dataRoot + \"/output/data/\"\n",
"# the path to save the xgboost model\n",
- "output_path_model=dataRoot + \"/mortgage/new-model-path\""
+ "output_path_model=dataRoot + \"/output/model/\""
]
},
{
@@ -97,65 +95,117 @@
"outputs": [],
"source": [
"# File schema\n",
- "_csv_perf_schema = StructType([\n",
- " StructField(\"loan_id\", LongType()),\n",
- " StructField(\"monthly_reporting_period\", StringType()),\n",
- " StructField(\"servicer\", StringType()),\n",
- " StructField(\"interest_rate\", DoubleType()),\n",
- " StructField(\"current_actual_upb\", DoubleType()),\n",
- " StructField(\"loan_age\", DoubleType()),\n",
- " StructField(\"remaining_months_to_legal_maturity\", DoubleType()),\n",
- " StructField(\"adj_remaining_months_to_maturity\", DoubleType()),\n",
- " StructField(\"maturity_date\", StringType()),\n",
- " StructField(\"msa\", DoubleType()),\n",
- " StructField(\"current_loan_delinquency_status\", IntegerType()),\n",
- " StructField(\"mod_flag\", StringType()),\n",
- " StructField(\"zero_balance_code\", StringType()),\n",
- " StructField(\"zero_balance_effective_date\", StringType()),\n",
- " StructField(\"last_paid_installment_date\", StringType()),\n",
- " StructField(\"foreclosed_after\", StringType()),\n",
- " StructField(\"disposition_date\", StringType()),\n",
- " StructField(\"foreclosure_costs\", DoubleType()),\n",
- " StructField(\"prop_preservation_and_repair_costs\", DoubleType()),\n",
- " StructField(\"asset_recovery_costs\", DoubleType()),\n",
- " StructField(\"misc_holding_expenses\", DoubleType()),\n",
- " StructField(\"holding_taxes\", DoubleType()),\n",
- " StructField(\"net_sale_proceeds\", DoubleType()),\n",
- " StructField(\"credit_enhancement_proceeds\", DoubleType()),\n",
- " StructField(\"repurchase_make_whole_proceeds\", StringType()),\n",
- " StructField(\"other_foreclosure_proceeds\", DoubleType()),\n",
- " StructField(\"non_interest_bearing_upb\", DoubleType()),\n",
- " StructField(\"principal_forgiveness_upb\", StringType()),\n",
- " StructField(\"repurchase_make_whole_proceeds_flag\", StringType()),\n",
- " StructField(\"foreclosure_principal_write_off_amount\", StringType()),\n",
- " StructField(\"servicing_activity_indicator\", StringType())])\n",
"\n",
- "_csv_acq_schema = StructType([\n",
- " StructField(\"loan_id\", LongType()),\n",
- " StructField(\"orig_channel\", StringType()),\n",
- " StructField(\"seller_name\", StringType()),\n",
- " StructField(\"orig_interest_rate\", DoubleType()),\n",
- " StructField(\"orig_upb\", IntegerType()),\n",
- " StructField(\"orig_loan_term\", IntegerType()),\n",
- " StructField(\"orig_date\", StringType()),\n",
- " StructField(\"first_pay_date\", StringType()),\n",
- " StructField(\"orig_ltv\", DoubleType()),\n",
- " StructField(\"orig_cltv\", DoubleType()),\n",
- " StructField(\"num_borrowers\", DoubleType()),\n",
- " StructField(\"dti\", DoubleType()),\n",
- " StructField(\"borrower_credit_score\", DoubleType()),\n",
- " StructField(\"first_home_buyer\", StringType()),\n",
- " StructField(\"loan_purpose\", StringType()),\n",
- " StructField(\"property_type\", StringType()),\n",
- " StructField(\"num_units\", IntegerType()),\n",
- " StructField(\"occupancy_status\", StringType()),\n",
- " StructField(\"property_state\", StringType()),\n",
- " StructField(\"zip\", IntegerType()),\n",
- " StructField(\"mortgage_insurance_percent\", DoubleType()),\n",
- " StructField(\"product_type\", StringType()),\n",
- " StructField(\"coborrow_credit_score\", DoubleType()),\n",
- " StructField(\"mortgage_insurance_type\", DoubleType()),\n",
- " StructField(\"relocation_mortgage_indicator\", StringType())])"
+ "_csv_raw_schema = StructType([\n",
+ " StructField(\"reference_pool_id\", StringType()),\n",
+ " StructField(\"loan_id\", LongType()),\n",
+ " StructField(\"monthly_reporting_period\", StringType()),\n",
+ " StructField(\"orig_channel\", StringType()),\n",
+ " StructField(\"seller_name\", StringType()),\n",
+ " StructField(\"servicer\", StringType()),\n",
+ " StructField(\"master_servicer\", StringType()),\n",
+ " StructField(\"orig_interest_rate\", DoubleType()),\n",
+ " StructField(\"interest_rate\", DoubleType()),\n",
+ " StructField(\"orig_upb\", DoubleType()),\n",
+ " StructField(\"upb_at_issuance\", StringType()),\n",
+ " StructField(\"current_actual_upb\", DoubleType()),\n",
+ " StructField(\"orig_loan_term\", IntegerType()),\n",
+ " StructField(\"orig_date\", StringType()),\n",
+ " StructField(\"first_pay_date\", StringType()), \n",
+ " StructField(\"loan_age\", DoubleType()),\n",
+ " StructField(\"remaining_months_to_legal_maturity\", DoubleType()),\n",
+ " StructField(\"adj_remaining_months_to_maturity\", DoubleType()),\n",
+ " StructField(\"maturity_date\", StringType()),\n",
+ " StructField(\"orig_ltv\", DoubleType()),\n",
+ " StructField(\"orig_cltv\", DoubleType()),\n",
+ " StructField(\"num_borrowers\", DoubleType()),\n",
+ " StructField(\"dti\", DoubleType()),\n",
+ " StructField(\"borrower_credit_score\", DoubleType()),\n",
+ " StructField(\"coborrow_credit_score\", DoubleType()),\n",
+ " StructField(\"first_home_buyer\", StringType()),\n",
+ " StructField(\"loan_purpose\", StringType()),\n",
+ " StructField(\"property_type\", StringType()),\n",
+ " StructField(\"num_units\", IntegerType()),\n",
+ " StructField(\"occupancy_status\", StringType()),\n",
+ " StructField(\"property_state\", StringType()),\n",
+ " StructField(\"msa\", DoubleType()),\n",
+ " StructField(\"zip\", IntegerType()),\n",
+ " StructField(\"mortgage_insurance_percent\", DoubleType()),\n",
+ " StructField(\"product_type\", StringType()),\n",
+ " StructField(\"prepayment_penalty_indicator\", StringType()),\n",
+ " StructField(\"interest_only_loan_indicator\", StringType()),\n",
+ " StructField(\"interest_only_first_principal_and_interest_payment_date\", StringType()),\n",
+ " StructField(\"months_to_amortization\", StringType()),\n",
+ " StructField(\"current_loan_delinquency_status\", IntegerType()),\n",
+ " StructField(\"loan_payment_history\", StringType()),\n",
+ " StructField(\"mod_flag\", StringType()),\n",
+ " StructField(\"mortgage_insurance_cancellation_indicator\", StringType()),\n",
+ " StructField(\"zero_balance_code\", StringType()),\n",
+ " StructField(\"zero_balance_effective_date\", StringType()),\n",
+ " StructField(\"upb_at_the_time_of_removal\", StringType()),\n",
+ " StructField(\"repurchase_date\", StringType()),\n",
+ " StructField(\"scheduled_principal_current\", StringType()),\n",
+ " StructField(\"total_principal_current\", StringType()),\n",
+ " StructField(\"unscheduled_principal_current\", StringType()),\n",
+ " StructField(\"last_paid_installment_date\", StringType()),\n",
+ " StructField(\"foreclosed_after\", StringType()),\n",
+ " StructField(\"disposition_date\", StringType()),\n",
+ " StructField(\"foreclosure_costs\", DoubleType()),\n",
+ " StructField(\"prop_preservation_and_repair_costs\", DoubleType()),\n",
+ " StructField(\"asset_recovery_costs\", DoubleType()),\n",
+ " StructField(\"misc_holding_expenses\", DoubleType()),\n",
+ " StructField(\"holding_taxes\", DoubleType()),\n",
+ " StructField(\"net_sale_proceeds\", DoubleType()),\n",
+ " StructField(\"credit_enhancement_proceeds\", DoubleType()),\n",
+ " StructField(\"repurchase_make_whole_proceeds\", StringType()),\n",
+ " StructField(\"other_foreclosure_proceeds\", DoubleType()),\n",
+ " StructField(\"non_interest_bearing_upb\", DoubleType()),\n",
+ " StructField(\"principal_forgiveness_upb\", StringType()),\n",
+ " StructField(\"original_list_start_date\", StringType()),\n",
+ " StructField(\"original_list_price\", StringType()),\n",
+ " StructField(\"current_list_start_date\", StringType()),\n",
+ " StructField(\"current_list_price\", StringType()),\n",
+ " StructField(\"borrower_credit_score_at_issuance\", StringType()),\n",
+ " StructField(\"co-borrower_credit_score_at_issuance\", StringType()),\n",
+ " StructField(\"borrower_credit_score_current\", StringType()),\n",
+ " StructField(\"co-Borrower_credit_score_current\", StringType()),\n",
+ " StructField(\"mortgage_insurance_type\", DoubleType()),\n",
+ " StructField(\"servicing_activity_indicator\", StringType()),\n",
+ " StructField(\"current_period_modification_loss_amount\", StringType()),\n",
+ " StructField(\"cumulative_modification_loss_amount\", StringType()),\n",
+ " StructField(\"current_period_credit_event_net_gain_or_loss\", StringType()),\n",
+ " StructField(\"cumulative_credit_event_net_gain_or_loss\", StringType()),\n",
+ " StructField(\"homeready_program_indicator\", StringType()),\n",
+ " StructField(\"foreclosure_principal_write_off_amount\", StringType()),\n",
+ " StructField(\"relocation_mortgage_indicator\", StringType()),\n",
+ " StructField(\"zero_balance_code_change_date\", StringType()),\n",
+ " StructField(\"loan_holdback_indicator\", StringType()),\n",
+ " StructField(\"loan_holdback_effective_date\", StringType()),\n",
+ " StructField(\"delinquent_accrued_interest\", StringType()),\n",
+ " StructField(\"property_valuation_method\", StringType()),\n",
+ " StructField(\"high_balance_loan_indicator\", StringType()),\n",
+ " StructField(\"arm_initial_fixed-rate_period_lt_5_yr_indicator\", StringType()),\n",
+ " StructField(\"arm_product_type\", StringType()),\n",
+ " StructField(\"initial_fixed-rate_period\", StringType()),\n",
+ " StructField(\"interest_rate_adjustment_frequency\", StringType()),\n",
+ " StructField(\"next_interest_rate_adjustment_date\", StringType()),\n",
+ " StructField(\"next_payment_change_date\", StringType()),\n",
+ " StructField(\"index\", StringType()),\n",
+ " StructField(\"arm_cap_structure\", StringType()),\n",
+ " StructField(\"initial_interest_rate_cap_up_percent\", StringType()),\n",
+ " StructField(\"periodic_interest_rate_cap_up_percent\", StringType()),\n",
+ " StructField(\"lifetime_interest_rate_cap_up_percent\", StringType()),\n",
+ " StructField(\"mortgage_margin\", StringType()),\n",
+ " StructField(\"arm_balloon_indicator\", StringType()),\n",
+ " StructField(\"arm_plan_number\", StringType()),\n",
+ " StructField(\"borrower_assistance_plan\", StringType()),\n",
+ " StructField(\"hltv_refinance_option_indicator\", StringType()),\n",
+ " StructField(\"deal_name\", StringType()),\n",
+ " StructField(\"repurchase_make_whole_proceeds_flag\", StringType()),\n",
+ " StructField(\"alternative_delinquency_resolution\", StringType()),\n",
+ " StructField(\"alternative_delinquency_resolution_count\", StringType()),\n",
+ " StructField(\"total_deferral_amount\", StringType())\n",
+ " ])"
]
},
{
@@ -312,14 +362,14 @@
"outputs": [],
"source": [
"def _get_quarter_from_csv_file_name():\n",
- " return substring_index(substring_index(input_file_name(), \".\", 1), \"_\", -1)"
+ " return substring_index(substring_index(input_file_name(), \".\", 1), \"/\", -1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "* Define function to read Performance CSV data file"
+ "* Define function to read raw CSV data file"
]
},
{
@@ -328,37 +378,98 @@
"metadata": {},
"outputs": [],
"source": [
- "def read_perf_csv(spark, path):\n",
- " return spark.read.format(\"csv\") \\\n",
- " .option(\"nullValue\", \"\") \\\n",
- " .option(\"header\", \"false\") \\\n",
- " .option(\"delimiter\", \"|\") \\\n",
- " .schema(_csv_perf_schema) \\\n",
+ "def read_raw_csv(spark, path):\n",
+ " return spark.read.format('csv') \\\n",
+ " .option('nullValue', '') \\\n",
+ " .option('header', False) \\\n",
+ " .option('delimiter', '|') \\\n",
+ " .schema(_csv_raw_schema) \\\n",
" .load(path) \\\n",
- " .withColumn(\"quarter\", _get_quarter_from_csv_file_name())"
+ " .withColumn('quarter', _get_quarter_from_csv_file_name())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "* Define function to read Acquisition CSV file"
+ "* Functions to extract perf and acq columns from raw schema"
]
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
- "def read_acq_csv(spark, path):\n",
- " return spark.read.format(\"csv\") \\\n",
- " .option(\"nullValue\", \"\") \\\n",
- " .option(\"header\", \"false\") \\\n",
- " .option(\"delimiter\", \"|\") \\\n",
- " .schema(_csv_acq_schema) \\\n",
- " .load(path) \\\n",
- " .withColumn(\"quarter\", _get_quarter_from_csv_file_name())"
+ "def extract_perf_columns(rawDf):\n",
+ " perfDf = rawDf.select(\n",
+ " col(\"loan_id\"),\n",
+ " date_format(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"monthly_reporting_period\"),\n",
+ " upper(col(\"servicer\")).alias(\"servicer\"),\n",
+ " col(\"interest_rate\"),\n",
+ " col(\"current_actual_upb\"),\n",
+ " col(\"loan_age\"),\n",
+ " col(\"remaining_months_to_legal_maturity\"),\n",
+ " col(\"adj_remaining_months_to_maturity\"),\n",
+ " date_format(to_date(col(\"maturity_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"maturity_date\"),\n",
+ " col(\"msa\"),\n",
+ " col(\"current_loan_delinquency_status\"),\n",
+ " col(\"mod_flag\"),\n",
+ " col(\"zero_balance_code\"),\n",
+ " date_format(to_date(col(\"zero_balance_effective_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"zero_balance_effective_date\"),\n",
+ " date_format(to_date(col(\"last_paid_installment_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"last_paid_installment_date\"),\n",
+ " date_format(to_date(col(\"foreclosed_after\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"foreclosed_after\"),\n",
+ " date_format(to_date(col(\"disposition_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"disposition_date\"),\n",
+ " col(\"foreclosure_costs\"),\n",
+ " col(\"prop_preservation_and_repair_costs\"),\n",
+ " col(\"asset_recovery_costs\"),\n",
+ " col(\"misc_holding_expenses\"),\n",
+ " col(\"holding_taxes\"),\n",
+ " col(\"net_sale_proceeds\"),\n",
+ " col(\"credit_enhancement_proceeds\"),\n",
+ " col(\"repurchase_make_whole_proceeds\"),\n",
+ " col(\"other_foreclosure_proceeds\"),\n",
+ " col(\"non_interest_bearing_upb\"),\n",
+ " col(\"principal_forgiveness_upb\"),\n",
+ " col(\"repurchase_make_whole_proceeds_flag\"),\n",
+ " col(\"foreclosure_principal_write_off_amount\"),\n",
+ " col(\"servicing_activity_indicator\"),\n",
+ " col('quarter')\n",
+ " )\n",
+ " return perfDf.select(\"*\").filter(\"current_actual_upb != 0.0\")\n",
+ "\n",
+ "def extract_acq_columns(rawDf):\n",
+ " acqDf = rawDf.select(\n",
+ " col(\"loan_id\"),\n",
+ " col(\"orig_channel\"),\n",
+ " upper(col(\"seller_name\")).alias(\"seller_name\"),\n",
+ " col(\"orig_interest_rate\"),\n",
+ " col(\"orig_upb\"),\n",
+ " col(\"orig_loan_term\"),\n",
+ " date_format(to_date(col(\"orig_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"orig_date\"),\n",
+ " date_format(to_date(col(\"first_pay_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"first_pay_date\"),\n",
+ " col(\"orig_ltv\"),\n",
+ " col(\"orig_cltv\"),\n",
+ " col(\"num_borrowers\"),\n",
+ " col(\"dti\"),\n",
+ " col(\"borrower_credit_score\"),\n",
+ " col(\"first_home_buyer\"),\n",
+ " col(\"loan_purpose\"),\n",
+ " col(\"property_type\"),\n",
+ " col(\"num_units\"),\n",
+ " col(\"occupancy_status\"),\n",
+ " col(\"property_state\"),\n",
+ " col(\"zip\"),\n",
+ " col(\"mortgage_insurance_percent\"),\n",
+ " col(\"product_type\"),\n",
+ " col(\"coborrow_credit_score\"),\n",
+ " col(\"mortgage_insurance_type\"),\n",
+ " col(\"relocation_mortgage_indicator\"),\n",
+ " dense_rank().over(Window.partitionBy(\"loan_id\").orderBy(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"))).alias(\"rank\"),\n",
+ " col('quarter')\n",
+ " )\n",
+ "\n",
+ " return acqDf.select(\"*\").filter(col(\"rank\")==1)"
]
},
{
@@ -372,7 +483,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -398,7 +509,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -477,7 +588,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -524,7 +635,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -548,7 +659,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@@ -583,7 +694,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -610,7 +721,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -619,12 +730,9 @@
"# CPU run, set to false, it can only make ETL run on CPU when is_save_dataset=True.\n",
"# spark.conf.set(\"spark.rapids.sql.enabled\", \"false\")\n",
"spark.conf.set(\"spark.sql.files.maxPartitionBytes\", \"1G\")\n",
- "spark.conf.set(\"spark.sql.shuffle.partitions\", \"192\")\n",
"spark.conf.set(\"spark.rapids.sql.explain\", \"ALL\")\n",
- "spark.conf.set(\"spark.rapids.sql.incompatibleOps.enabled\", \"true\")\n",
"spark.conf.set(\"spark.rapids.sql.batchSizeBytes\", \"512M\")\n",
"spark.conf.set(\"spark.rapids.sql.reader.batchSizeBytes\", \"768M\")\n",
- "spark.conf.set(\"spark.rapids.sql.incompatibleDateFormats.enabled\", \"true\")\n",
"spark.conf.set(\"spark.rapids.sql.hasNans\", \"false\")\n",
"# use GPU to read CSV\n",
"spark.conf.set(\"spark.rapids.sql.csv.read.double.enabled\", \"true\")"
@@ -639,7 +747,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 20,
"metadata": {
"scrolled": false
},
@@ -648,27 +756,24 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "ETL takes 41.10439682006836\n"
+ "ETL takes 135.9117729663849\n"
]
}
],
"source": [
"\n",
"# read raw dataset\n",
- "perf = read_perf_csv(spark, orig_perf_path)\n",
- "acq = read_acq_csv(spark, orig_acq_path)\n",
+ "rawDf = read_raw_csv(spark, orig_raw_path)\n",
+ "acq = extract_acq_columns(rawDf)\n",
+ "perf = extract_perf_columns(rawDf)\n",
"\n",
"# run main function to process data\n",
"out = run_mortgage(spark, perf, acq)\n",
"\n",
- "# split 80% for training, 20% for test\n",
- "splits = out.randomSplit([0.8, 0.2])\n",
- "\n",
"# save processed data\n",
"if is_save_dataset:\n",
" start = time.time()\n",
- " splits[0].write.parquet(output_path_train, mode=\"overwrite\")\n",
- " splits[1].write.parquet(output_path_test, mode=\"overwrite\")\n",
+ " out.write.parquet(output_path_data, mode=\"overwrite\")\n",
" end = time.time()\n",
" print(\"ETL takes {}\".format(end - start))"
]
@@ -689,7 +794,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -706,7 +811,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -725,7 +830,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
@@ -742,7 +847,7 @@
" StructField(\"seller_name\", FloatType()),\n",
" StructField(\"mod_flag\", FloatType()),\n",
" StructField(\"orig_interest_rate\", FloatType()),\n",
- " StructField(\"orig_upb\", IntegerType()),\n",
+ " StructField(\"orig_upb\", DoubleType()),\n",
" StructField(\"orig_loan_term\", IntegerType()),\n",
" StructField(\"orig_ltv\", FloatType()),\n",
" StructField(\"orig_cltv\", FloatType()),\n",
@@ -764,17 +869,20 @@
"\n",
"if is_save_dataset:\n",
" # load dataset from file\n",
- " train_data = reader.parquet(output_path_train)\n",
- " test_data = reader.parquet(output_path_test)\n",
+ " etlDf = reader.parquet(output_path_data)\n",
+ " splits = etlDf.randomSplit([0.8, 0.2])\n",
+ " train_data = splits[0]\n",
+ " test_data = splits[1]\n",
"else:\n",
" # use Dataframe from ETL directly\n",
+ " splits = out.randomSplit([0.8, 0.2])\n",
" train_data = splits[0]\n",
" test_data = splits[1]"
]
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -785,21 +893,21 @@
" \"growPolicy\": \"depthwise\",\n",
" \"nthread\": 1,\n",
" \"numRound\": 100,\n",
- " \"numWorkers\": 2,\n",
+ " \"numWorkers\": 1,\n",
"}\n",
"classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)"
]
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Training takes 23.666603565216064 seconds\n"
+ "Training takes 18.92583155632019 seconds\n"
]
}
],
@@ -815,7 +923,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
@@ -825,22 +933,22 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Transformation takes 10.464573383331299 seconds\n",
+ "Transformation takes 8.959877967834473 seconds\n",
"+--------------+--------------------+--------------------+----------+\n",
"|delinquency_12| rawPrediction| probability|prediction|\n",
"+--------------+--------------------+--------------------+----------+\n",
- "| 0|[11.3724613189697...|[0.99998849205439...| 0.0|\n",
- "| 0|[8.75509834289550...|[0.99984236936143...| 0.0|\n",
- "| 0|[8.56840324401855...|[0.99981002029380...| 0.0|\n",
- "| 0|[8.45872020721435...|[0.99978800168901...| 0.0|\n",
- "| 0|[8.45872020721435...|[0.99978800168901...| 0.0|\n",
+ "| 0|[7.92072248458862...|[0.99963699193904...| 0.0|\n",
+ "| 0|[7.92072248458862...|[0.99963699193904...| 0.0|\n",
+ "| 0|[8.43130302429199...|[0.99978211015695...| 0.0|\n",
+ "| 0|[8.20779895782470...|[0.99972755435737...| 0.0|\n",
+ "| 0|[8.885986328125,-...|[0.99986170543706...| 0.0|\n",
"+--------------+--------------------+--------------------+----------+\n",
"only showing top 5 rows\n",
"\n"
@@ -858,15 +966,15 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Evaluation takes 0.770418643951416 seconds\n",
- "Accuracy is 0.9881320119084719\n"
+ "Evaluation takes 0.6158628463745117 seconds\n",
+ "Accuracy is 0.9861453808970397\n"
]
}
],
@@ -879,7 +987,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@@ -903,7 +1011,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.2"
+ "version": "3.6.9"
},
"name": "gpu-mortgage",
"notebookId": 4440374682851873
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb
index d36474176..c029770e1 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb
@@ -6,7 +6,8 @@
"source": [
"## Prerequirement\n",
"### 1. Download data\n",
- "All data could be found at https://docs.rapids.ai/datasets/mortgage-data\n",
+ "\n",
+    "Download the Single-Family Loan Performance Data from the [Fannie Mae](https://datadynamics.fanniemae.com/data-dynamics/#/reportMenu;category=HP) website.\n",
"\n",
"### 2. Download needed jars\n",
"* [rapids-4-spark_2.12-22.06.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar)\n",
@@ -38,7 +39,7 @@
},
{
"cell_type": "code",
- "execution_count": 42,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -60,7 +61,7 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -82,70 +83,121 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# File schema\n",
- "_csv_perf_schema = StructType([\n",
- " StructField('loan_id', LongType()),\n",
- " StructField('monthly_reporting_period', StringType()),\n",
- " StructField('servicer', StringType()),\n",
- " StructField('interest_rate', DoubleType()),\n",
- " StructField('current_actual_upb', DoubleType()),\n",
- " StructField('loan_age', DoubleType()),\n",
- " StructField('remaining_months_to_legal_maturity', DoubleType()),\n",
- " StructField('adj_remaining_months_to_maturity', DoubleType()),\n",
- " StructField('maturity_date', StringType()),\n",
- " StructField('msa', DoubleType()),\n",
- " StructField('current_loan_delinquency_status', IntegerType()),\n",
- " StructField('mod_flag', StringType()),\n",
- " StructField('zero_balance_code', StringType()),\n",
- " StructField('zero_balance_effective_date', StringType()),\n",
- " StructField('last_paid_installment_date', StringType()),\n",
- " StructField('foreclosed_after', StringType()),\n",
- " StructField('disposition_date', StringType()),\n",
- " StructField('foreclosure_costs', DoubleType()),\n",
- " StructField('prop_preservation_and_repair_costs', DoubleType()),\n",
- " StructField('asset_recovery_costs', DoubleType()),\n",
- " StructField('misc_holding_expenses', DoubleType()),\n",
- " StructField('holding_taxes', DoubleType()),\n",
- " StructField('net_sale_proceeds', DoubleType()),\n",
- " StructField('credit_enhancement_proceeds', DoubleType()),\n",
- " StructField('repurchase_make_whole_proceeds', StringType()),\n",
- " StructField('other_foreclosure_proceeds', DoubleType()),\n",
- " StructField('non_interest_bearing_upb', DoubleType()),\n",
- " StructField('principal_forgiveness_upb', StringType()),\n",
- " StructField('repurchase_make_whole_proceeds_flag', StringType()),\n",
- " StructField('foreclosure_principal_write_off_amount', StringType()),\n",
- " StructField('servicing_activity_indicator', StringType())])\n",
- "\n",
- "_csv_acq_schema = StructType([\n",
- " StructField('loan_id', LongType()),\n",
- " StructField('orig_channel', StringType()),\n",
- " StructField('seller_name', StringType()),\n",
- " StructField('orig_interest_rate', DoubleType()),\n",
- " StructField('orig_upb', IntegerType()),\n",
- " StructField('orig_loan_term', IntegerType()),\n",
- " StructField('orig_date', StringType()),\n",
- " StructField('first_pay_date', StringType()),\n",
- " StructField('orig_ltv', DoubleType()),\n",
- " StructField('orig_cltv', DoubleType()),\n",
- " StructField('num_borrowers', DoubleType()),\n",
- " StructField('dti', DoubleType()),\n",
- " StructField('borrower_credit_score', DoubleType()),\n",
- " StructField('first_home_buyer', StringType()),\n",
- " StructField('loan_purpose', StringType()),\n",
- " StructField('property_type', StringType()),\n",
- " StructField('num_units', IntegerType()),\n",
- " StructField('occupancy_status', StringType()),\n",
- " StructField('property_state', StringType()),\n",
- " StructField('zip', IntegerType()),\n",
- " StructField('mortgage_insurance_percent', DoubleType()),\n",
- " StructField('product_type', StringType()),\n",
- " StructField('coborrow_credit_score', DoubleType()),\n",
- " StructField('mortgage_insurance_type', DoubleType()),\n",
- " StructField('relocation_mortgage_indicator', StringType())])"
+ "_csv_raw_schema = StructType([\n",
+ " StructField(\"reference_pool_id\", StringType()),\n",
+ " StructField(\"loan_id\", LongType()),\n",
+ " StructField(\"monthly_reporting_period\", StringType()),\n",
+ " StructField(\"orig_channel\", StringType()),\n",
+ " StructField(\"seller_name\", StringType()),\n",
+ " StructField(\"servicer\", StringType()),\n",
+ " StructField(\"master_servicer\", StringType()),\n",
+ " StructField(\"orig_interest_rate\", DoubleType()),\n",
+ " StructField(\"interest_rate\", DoubleType()),\n",
+ " StructField(\"orig_upb\", DoubleType()),\n",
+ " StructField(\"upb_at_issuance\", StringType()),\n",
+ " StructField(\"current_actual_upb\", DoubleType()),\n",
+ " StructField(\"orig_loan_term\", IntegerType()),\n",
+ " StructField(\"orig_date\", StringType()),\n",
+ " StructField(\"first_pay_date\", StringType()), \n",
+ " StructField(\"loan_age\", DoubleType()),\n",
+ " StructField(\"remaining_months_to_legal_maturity\", DoubleType()),\n",
+ " StructField(\"adj_remaining_months_to_maturity\", DoubleType()),\n",
+ " StructField(\"maturity_date\", StringType()),\n",
+ " StructField(\"orig_ltv\", DoubleType()),\n",
+ " StructField(\"orig_cltv\", DoubleType()),\n",
+ " StructField(\"num_borrowers\", DoubleType()),\n",
+ " StructField(\"dti\", DoubleType()),\n",
+ " StructField(\"borrower_credit_score\", DoubleType()),\n",
+ " StructField(\"coborrow_credit_score\", DoubleType()),\n",
+ " StructField(\"first_home_buyer\", StringType()),\n",
+ " StructField(\"loan_purpose\", StringType()),\n",
+ " StructField(\"property_type\", StringType()),\n",
+ " StructField(\"num_units\", IntegerType()),\n",
+ " StructField(\"occupancy_status\", StringType()),\n",
+ " StructField(\"property_state\", StringType()),\n",
+ " StructField(\"msa\", DoubleType()),\n",
+ " StructField(\"zip\", IntegerType()),\n",
+ " StructField(\"mortgage_insurance_percent\", DoubleType()),\n",
+ " StructField(\"product_type\", StringType()),\n",
+ " StructField(\"prepayment_penalty_indicator\", StringType()),\n",
+ " StructField(\"interest_only_loan_indicator\", StringType()),\n",
+ " StructField(\"interest_only_first_principal_and_interest_payment_date\", StringType()),\n",
+ " StructField(\"months_to_amortization\", StringType()),\n",
+ " StructField(\"current_loan_delinquency_status\", IntegerType()),\n",
+ " StructField(\"loan_payment_history\", StringType()),\n",
+ " StructField(\"mod_flag\", StringType()),\n",
+ " StructField(\"mortgage_insurance_cancellation_indicator\", StringType()),\n",
+ " StructField(\"zero_balance_code\", StringType()),\n",
+ " StructField(\"zero_balance_effective_date\", StringType()),\n",
+ " StructField(\"upb_at_the_time_of_removal\", StringType()),\n",
+ " StructField(\"repurchase_date\", StringType()),\n",
+ " StructField(\"scheduled_principal_current\", StringType()),\n",
+ " StructField(\"total_principal_current\", StringType()),\n",
+ " StructField(\"unscheduled_principal_current\", StringType()),\n",
+ " StructField(\"last_paid_installment_date\", StringType()),\n",
+ " StructField(\"foreclosed_after\", StringType()),\n",
+ " StructField(\"disposition_date\", StringType()),\n",
+ " StructField(\"foreclosure_costs\", DoubleType()),\n",
+ " StructField(\"prop_preservation_and_repair_costs\", DoubleType()),\n",
+ " StructField(\"asset_recovery_costs\", DoubleType()),\n",
+ " StructField(\"misc_holding_expenses\", DoubleType()),\n",
+ " StructField(\"holding_taxes\", DoubleType()),\n",
+ " StructField(\"net_sale_proceeds\", DoubleType()),\n",
+ " StructField(\"credit_enhancement_proceeds\", DoubleType()),\n",
+ " StructField(\"repurchase_make_whole_proceeds\", StringType()),\n",
+ " StructField(\"other_foreclosure_proceeds\", DoubleType()),\n",
+ " StructField(\"non_interest_bearing_upb\", DoubleType()),\n",
+ " StructField(\"principal_forgiveness_upb\", StringType()),\n",
+ " StructField(\"original_list_start_date\", StringType()),\n",
+ " StructField(\"original_list_price\", StringType()),\n",
+ " StructField(\"current_list_start_date\", StringType()),\n",
+ " StructField(\"current_list_price\", StringType()),\n",
+ " StructField(\"borrower_credit_score_at_issuance\", StringType()),\n",
+ " StructField(\"co-borrower_credit_score_at_issuance\", StringType()),\n",
+ " StructField(\"borrower_credit_score_current\", StringType()),\n",
+ " StructField(\"co-Borrower_credit_score_current\", StringType()),\n",
+ " StructField(\"mortgage_insurance_type\", DoubleType()),\n",
+ " StructField(\"servicing_activity_indicator\", StringType()),\n",
+ " StructField(\"current_period_modification_loss_amount\", StringType()),\n",
+ " StructField(\"cumulative_modification_loss_amount\", StringType()),\n",
+ " StructField(\"current_period_credit_event_net_gain_or_loss\", StringType()),\n",
+ " StructField(\"cumulative_credit_event_net_gain_or_loss\", StringType()),\n",
+ " StructField(\"homeready_program_indicator\", StringType()),\n",
+ " StructField(\"foreclosure_principal_write_off_amount\", StringType()),\n",
+ " StructField(\"relocation_mortgage_indicator\", StringType()),\n",
+ " StructField(\"zero_balance_code_change_date\", StringType()),\n",
+ " StructField(\"loan_holdback_indicator\", StringType()),\n",
+ " StructField(\"loan_holdback_effective_date\", StringType()),\n",
+ " StructField(\"delinquent_accrued_interest\", StringType()),\n",
+ " StructField(\"property_valuation_method\", StringType()),\n",
+ " StructField(\"high_balance_loan_indicator\", StringType()),\n",
+ " StructField(\"arm_initial_fixed-rate_period_lt_5_yr_indicator\", StringType()),\n",
+ " StructField(\"arm_product_type\", StringType()),\n",
+ " StructField(\"initial_fixed-rate_period\", StringType()),\n",
+ " StructField(\"interest_rate_adjustment_frequency\", StringType()),\n",
+ " StructField(\"next_interest_rate_adjustment_date\", StringType()),\n",
+ " StructField(\"next_payment_change_date\", StringType()),\n",
+ " StructField(\"index\", StringType()),\n",
+ " StructField(\"arm_cap_structure\", StringType()),\n",
+ " StructField(\"initial_interest_rate_cap_up_percent\", StringType()),\n",
+ " StructField(\"periodic_interest_rate_cap_up_percent\", StringType()),\n",
+ " StructField(\"lifetime_interest_rate_cap_up_percent\", StringType()),\n",
+ " StructField(\"mortgage_margin\", StringType()),\n",
+ " StructField(\"arm_balloon_indicator\", StringType()),\n",
+ " StructField(\"arm_plan_number\", StringType()),\n",
+ " StructField(\"borrower_assistance_plan\", StringType()),\n",
+ " StructField(\"hltv_refinance_option_indicator\", StringType()),\n",
+ " StructField(\"deal_name\", StringType()),\n",
+ " StructField(\"repurchase_make_whole_proceeds_flag\", StringType()),\n",
+ " StructField(\"alternative_delinquency_resolution\", StringType()),\n",
+ " StructField(\"alternative_delinquency_resolution_count\", StringType()),\n",
+ " StructField(\"total_deferral_amount\", StringType())\n",
+ " ])"
]
},
{
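The revised cell above replaces the old split Performance/Acquisition schemas with a single `_csv_raw_schema` of 108 pipe-delimited fields. As a quick sanity check, a downloaded file can be compared against the schema's field count before running the ETL; this is a hypothetical snippet, not part of the notebook, and the sample path is only an illustration:

    # hypothetical check: confirm a downloaded file has as many '|'-separated
    # fields per row as _csv_raw_schema declares before starting the ETL
    sample_path = '/mortgage/input/2017Q1.csv'  # example path only
    with open(sample_path) as f:
        first_row = f.readline().rstrip('\n').split('|')
    assert len(first_row) == len(_csv_raw_schema.fields), (
        f'expected {len(_csv_raw_schema.fields)} fields, got {len(first_row)}')
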
@@ -157,7 +209,7 @@
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -254,7 +306,7 @@
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
@@ -300,67 +352,129 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 2. Define ETL Process\n",
- "\n",
- "Define the function to do the ETL process\n",
- "\n",
- "#### 2.1 Define Functions to Read Raw CSV File\n",
- "\n",
- "* Define function to get quarter from input CSV file name"
+ "* Functions to extract perf and acq columns from raw schema"
]
},
{
"cell_type": "code",
- "execution_count": 47,
+ "execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
- "def _get_quarter_from_csv_file_name():\n",
- " return substring_index(substring_index(input_file_name(), '.', 1), '_', -1)"
+ "def extract_perf_columns(rawDf):\n",
+ " perfDf = rawDf.select(\n",
+ " col(\"loan_id\"),\n",
+ " date_format(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"monthly_reporting_period\"),\n",
+ " upper(col(\"servicer\")).alias(\"servicer\"),\n",
+ " col(\"interest_rate\"),\n",
+ " col(\"current_actual_upb\"),\n",
+ " col(\"loan_age\"),\n",
+ " col(\"remaining_months_to_legal_maturity\"),\n",
+ " col(\"adj_remaining_months_to_maturity\"),\n",
+ " date_format(to_date(col(\"maturity_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"maturity_date\"),\n",
+ " col(\"msa\"),\n",
+ " col(\"current_loan_delinquency_status\"),\n",
+ " col(\"mod_flag\"),\n",
+ " col(\"zero_balance_code\"),\n",
+ " date_format(to_date(col(\"zero_balance_effective_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"zero_balance_effective_date\"),\n",
+ " date_format(to_date(col(\"last_paid_installment_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"last_paid_installment_date\"),\n",
+ " date_format(to_date(col(\"foreclosed_after\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"foreclosed_after\"),\n",
+ " date_format(to_date(col(\"disposition_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").alias(\"disposition_date\"),\n",
+ " col(\"foreclosure_costs\"),\n",
+ " col(\"prop_preservation_and_repair_costs\"),\n",
+ " col(\"asset_recovery_costs\"),\n",
+ " col(\"misc_holding_expenses\"),\n",
+ " col(\"holding_taxes\"),\n",
+ " col(\"net_sale_proceeds\"),\n",
+ " col(\"credit_enhancement_proceeds\"),\n",
+ " col(\"repurchase_make_whole_proceeds\"),\n",
+ " col(\"other_foreclosure_proceeds\"),\n",
+ " col(\"non_interest_bearing_upb\"),\n",
+ " col(\"principal_forgiveness_upb\"),\n",
+ " col(\"repurchase_make_whole_proceeds_flag\"),\n",
+ " col(\"foreclosure_principal_write_off_amount\"),\n",
+ " col(\"servicing_activity_indicator\"),\n",
+ " col('quarter')\n",
+ " )\n",
+ "\n",
+ " return perfDf.select(\"*\").filter(\"current_actual_upb != 0.0\")\n",
+ "\n",
+ "def extract_acq_columns(rawDf):\n",
+ " acqDf = rawDf.select(\n",
+ " col(\"loan_id\"),\n",
+ " col(\"orig_channel\"),\n",
+ " upper(col(\"seller_name\")).alias(\"seller_name\"),\n",
+ " col(\"orig_interest_rate\"),\n",
+ " col(\"orig_upb\"),\n",
+ " col(\"orig_loan_term\"),\n",
+ " date_format(to_date(col(\"orig_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"orig_date\"),\n",
+ " date_format(to_date(col(\"first_pay_date\"),\"MMyyyy\"), \"MM/yyyy\").alias(\"first_pay_date\"),\n",
+ " col(\"orig_ltv\"),\n",
+ " col(\"orig_cltv\"),\n",
+ " col(\"num_borrowers\"),\n",
+ " col(\"dti\"),\n",
+ " col(\"borrower_credit_score\"),\n",
+ " col(\"first_home_buyer\"),\n",
+ " col(\"loan_purpose\"),\n",
+ " col(\"property_type\"),\n",
+ " col(\"num_units\"),\n",
+ " col(\"occupancy_status\"),\n",
+ " col(\"property_state\"),\n",
+ " col(\"zip\"),\n",
+ " col(\"mortgage_insurance_percent\"),\n",
+ " col(\"product_type\"),\n",
+ " col(\"coborrow_credit_score\"),\n",
+ " col(\"mortgage_insurance_type\"),\n",
+ " col(\"relocation_mortgage_indicator\"),\n",
+ " dense_rank().over(Window.partitionBy(\"loan_id\").orderBy(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"))).alias(\"rank\"),\n",
+ " col('quarter')\n",
+ " )\n",
+ "\n",
+ " return acqDf.select(\"*\").filter(col(\"rank\")==1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "* Define function to read Performance CSV data file"
+ "### 2. Define ETL Process\n",
+ "\n",
+ "Define the function to do the ETL process\n",
+ "\n",
+ "#### 2.1 Define Functions to Read Raw CSV File\n",
+ "\n",
+ "* Define function to get quarter from input CSV file name"
]
},
{
"cell_type": "code",
- "execution_count": 48,
+ "execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
- "def read_perf_csv(spark, path):\n",
- " return spark.read.format('csv') \\\n",
- " .option('nullValue', '') \\\n",
- " .option('header', 'false') \\\n",
- " .option('delimiter', '|') \\\n",
- " .schema(_csv_perf_schema) \\\n",
- " .load(path) \\\n",
- " .withColumn('quarter', _get_quarter_from_csv_file_name())"
+ "def _get_quarter_from_csv_file_name():\n",
+ " return substring_index(substring_index(input_file_name(), '.', 1), '/', -1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "* Define function to read Acquisition CSV file"
+ "* Define function to read raw CSV data file"
]
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
- "def read_acq_csv(spark, path):\n",
+ "def read_raw_csv(spark, path):\n",
" return spark.read.format('csv') \\\n",
" .option('nullValue', '') \\\n",
- " .option('header', 'false') \\\n",
+ " .option('header', False) \\\n",
" .option('delimiter', '|') \\\n",
- " .schema(_csv_acq_schema) \\\n",
+ " .schema(_csv_raw_schema) \\\n",
" .load(path) \\\n",
" .withColumn('quarter', _get_quarter_from_csv_file_name())"
]
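With `read_raw_csv`, the `quarter` column now comes from the file's base name: the inner `substring_index(..., '.', 1)` drops the extension and the outer `substring_index(..., '/', -1)` keeps the last path segment, so a file such as `/mortgage/input/2017Q1.csv` is tagged `2017Q1`. A minimal, self-contained illustration of the same two calls applied to a literal path (the path is an example only; the notebook derives it from `input_file_name()` at read time):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import lit, substring_index

    spark = SparkSession.builder.getOrCreate()
    # apply the same two substring_index calls to a literal example path
    spark.range(1).select(
        substring_index(substring_index(lit('/mortgage/input/2017Q1.csv'), '.', 1),
                        '/', -1).alias('quarter')
    ).show()  # prints a single row: 2017Q1
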
@@ -376,7 +490,7 @@
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
@@ -402,7 +516,7 @@
},
{
"cell_type": "code",
- "execution_count": 51,
+ "execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
@@ -481,7 +595,7 @@
},
{
"cell_type": "code",
- "execution_count": 52,
+ "execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
@@ -528,7 +642,7 @@
},
{
"cell_type": "code",
- "execution_count": 53,
+ "execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
@@ -552,7 +666,7 @@
},
{
"cell_type": "code",
- "execution_count": 54,
+ "execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
@@ -587,7 +701,7 @@
},
{
"cell_type": "code",
- "execution_count": 55,
+ "execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
@@ -615,31 +729,13 @@
},
{
"cell_type": "code",
- "execution_count": 56,
+ "execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"# You need to update them to your real paths!\n",
- "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
- "orig_perf_path=dataRoot + '/mortgage/Performance/'\n",
- "orig_acq_path=dataRoot + '/mortgage/Acquisition/'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "* Define temporary folder path "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "metadata": {},
- "outputs": [],
- "source": [
- "tmp_perf_path=dataRoot + '/mortgage/perf/'\n",
- "tmp_acq_path=dataRoot + '/mortgage/acq/'"
+ "dataRoot = os.getenv(\"DATA_ROOT\", \"/mortgage\")\n",
+ "orig_raw_path = dataRoot + '/input/'"
]
},
{
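The ETL now reads everything from a single input folder under `DATA_ROOT` instead of separate Performance/ and Acquisition/ trees. A small sketch, assuming the extracted CSVs were copied to a placeholder location, of pointing `DATA_ROOT` at them before the cell above runs:

    import os
    # '/path/to/mortgage' is a placeholder; the raw pipe-delimited csv files
    # are expected under $DATA_ROOT/input/ by the cell above
    os.environ.setdefault('DATA_ROOT', '/path/to/mortgage')
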
@@ -651,11 +747,14 @@
},
{
"cell_type": "code",
- "execution_count": 59,
+ "execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
- "output_path=dataRoot + '/mortgage/output/'"
+ "output_path = dataRoot + '/output/data/'\n",
+ "output_path_train = dataRoot + '/output/train/'\n",
+ "output_path_eval = dataRoot + '/output/eval/'\n",
+ "save_train_eval_dataset = True"
]
},
{
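The output cell now defines three locations plus a `save_train_eval_dataset` switch. A minimal sketch of one way the flag and paths could be consumed once the ETL yields a DataFrame; this is not the notebook's own save step (which sits outside this hunk), and `etl_df` is a synthetic stand-in:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    etl_df = spark.range(100).withColumnRenamed('id', 'loan_id')  # stand-in only

    etl_df.write.parquet(output_path, mode='overwrite')
    if save_train_eval_dataset:
        # hypothetical 80/20 split; the real notebook may split differently
        train_df, eval_df = etl_df.randomSplit([0.8, 0.2], seed=42)
        train_df.write.parquet(output_path_train, mode='overwrite')
        eval_df.write.parquet(output_path_eval, mode='overwrite')
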
@@ -667,12 +766,11 @@
},
{
"cell_type": "code",
- "execution_count": 60,
+ "execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"spark.conf.set('spark.rapids.sql.explain', 'ALL')\n",
- "spark.conf.set('spark.rapids.sql.incompatibleOps.enabled', 'true')\n",
"spark.conf.set('spark.rapids.sql.batchSizeBytes', '512M')\n",
"spark.conf.set('spark.rapids.sql.reader.batchSizeBytes', '768M')"
]
@@ -681,50 +779,25 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Run Part\n",
- "### Read Raw File and Transcode Data\n",
- "#### 1. Add additional Spark settings"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "metadata": {},
- "outputs": [],
- "source": [
- "# we want a few big files instead of lots of small files\n",
- "spark.conf.set('spark.sql.files.maxPartitionBytes', '200G')"
+ "## Run Part"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "#### 2. Read Raw File and Transcode to Parquet"
+ "### Read Raw File"
]
},
{
"cell_type": "code",
- "execution_count": 62,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "6.568682670593262\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "start = time.time()\n",
- "# read data and transcode to qarquet\n",
- "acq = read_acq_csv(spark, orig_acq_path)\n",
- "acq.repartition(12).write.parquet(tmp_acq_path, mode='overwrite')\n",
- "perf = read_perf_csv(spark, orig_perf_path)\n",
- "perf.coalesce(96).write.parquet(tmp_perf_path, mode='overwrite')\n",
- "end = time.time()\n",
- "print(end - start)"
+ "rawDf = read_raw_csv(spark, orig_raw_path)\n",
+ "acq = extract_acq_columns(rawDf)\n",
+ "perf = extract_perf_columns(rawDf)"
]
},
{
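After the raw read and the two extractions above, a quick optional check (illustrative only, not part of the notebook) confirms the quarter tagging and that the acq extraction reduces to the earliest reporting period per loan:

    # illustrative check: quarters picked up from the file names, plus row counts
    rawDf.select('quarter').distinct().show()
    print('perf rows:', perf.count())
    print('acq rows (earliest period per loan):', acq.count())
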
@@ -737,7 +810,7 @@
},
{
"cell_type": "code",
- "execution_count": 63,
+ "execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
@@ -746,7 +819,9 @@
"# CPU run, set to false\n",
"# spark.conf.set('spark.rapids.sql.enabled', 'false')\n",
"spark.conf.set('spark.sql.files.maxPartitionBytes', '1G')\n",
- "spark.conf.set('spark.sql.shuffle.partitions', '192')"
+ "spark.conf.set(\"spark.rapids.sql.hasNans\", \"false\")\n",
+ "# use GPU to read CSV\n",
+ "spark.conf.set(\"spark.rapids.sql.csv.read.double.enabled\", \"true\")"
]
},
{
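Before kicking off the transform, the GPU-related settings above can be echoed back from the active session; an optional, illustrative check:

    # illustrative only: confirm the session picked up the values set above
    for key in ('spark.sql.files.maxPartitionBytes',
                'spark.rapids.sql.hasNans',
                'spark.rapids.sql.csv.read.double.enabled'):
        print(key, '=', spark.conf.get(key))
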
@@ -758,7 +833,7 @@
},
{
"cell_type": "code",
- "execution_count": 64,
+ "execution_count": 61,
"metadata": {},
"outputs": [
{
@@ -766,786 +841,881 @@
"output_type": "stream",
"text": [
"== Physical Plan ==\n",
- "*(5) GpuColumnarToRow false\n",
- "+- !GpuProject [gpucoalesce(orig_channel#27851, 0) AS orig_channel#29615, gpucoalesce(first_home_buyer#28053, 0) AS first_home_buyer#29616, gpucoalesce(loan_purpose#28255, 0) AS loan_purpose#29617, gpucoalesce(property_type#28457, 0) AS property_type#29618, gpucoalesce(occupancy_status#28659, 0) AS occupancy_status#29619, gpucoalesce(property_state#28861, 0) AS property_state#29620, gpucoalesce(relocation_mortgage_indicator#29063, 0) AS relocation_mortgage_indicator#29621, gpucoalesce(seller_name#29265, 0) AS seller_name#29622, gpucoalesce(id#27657, 0) AS mod_flag#29623, gpucoalesce(gpunanvl(orig_interest_rate#26291, null), 0.0) AS orig_interest_rate#29624, gpucoalesce(orig_upb#26292, 0) AS orig_upb#29625, gpucoalesce(orig_loan_term#26293, 0) AS orig_loan_term#29626, gpucoalesce(gpunanvl(orig_ltv#26296, null), 0.0) AS orig_ltv#29627, gpucoalesce(gpunanvl(orig_cltv#26297, null), 0.0) AS orig_cltv#29628, gpucoalesce(gpunanvl(num_borrowers#26298, null), 0.0) AS num_borrowers#29629, gpucoalesce(gpunanvl(dti#26299, null), 0.0) AS dti#29630, gpucoalesce(gpunanvl(borrower_credit_score#26300, null), 0.0) AS borrower_credit_score#29631, gpucoalesce(num_units#26304, 0) AS num_units#29632, gpucoalesce(zip#26307, 0) AS zip#29633, gpucoalesce(gpunanvl(mortgage_insurance_percent#26308, null), 0.0) AS mortgage_insurance_percent#29634, gpucoalesce(current_loan_delinquency_status#26234, 0) AS current_loan_delinquency_status#29635, gpucoalesce(gpunanvl(current_actual_upb#26228, null), 0.0) AS current_actual_upb#29636, gpucoalesce(gpunanvl(interest_rate#26227, null), 0.0) AS interest_rate#29637, gpucoalesce(gpunanvl(loan_age#26229, null), 0.0) AS loan_age#29638, ... 3 more fields]\n",
- " +- !GpuBroadcastHashJoin [mod_flag#26235], [mod_flag#29333], LeftOuter, BuildRight\n",
- " :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, num_units#26304, zip#26307, mortgage_insurance_percent#26308, orig_channel#27851, first_home_buyer#28053, loan_purpose#28255, property_type#28457, occupancy_status#28659, ... 3 more fields]\n",
- " : +- !GpuBroadcastHashJoin [seller_name#27396], [seller_name#29131], LeftOuter, BuildRight\n",
- " : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, num_units#26304, zip#26307, mortgage_insurance_percent#26308, orig_channel#27851, first_home_buyer#28053, loan_purpose#28255, property_type#28457, ... 3 more fields]\n",
- " : : +- !GpuBroadcastHashJoin [relocation_mortgage_indicator#26312], [relocation_mortgage_indicator#28929], LeftOuter, BuildRight\n",
- " : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, num_units#26304, zip#26307, mortgage_insurance_percent#26308, relocation_mortgage_indicator#26312, orig_channel#27851, first_home_buyer#28053, loan_purpose#28255, ... 3 more fields]\n",
- " : : : +- !GpuBroadcastHashJoin [property_state#26306], [property_state#28727], LeftOuter, BuildRight\n",
- " : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, num_units#26304, property_state#26306, zip#26307, mortgage_insurance_percent#26308, relocation_mortgage_indicator#26312, orig_channel#27851, first_home_buyer#28053, ... 3 more fields]\n",
- " : : : : +- !GpuBroadcastHashJoin [occupancy_status#26305], [occupancy_status#28525], LeftOuter, BuildRight\n",
- " : : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, num_units#26304, occupancy_status#26305, property_state#26306, zip#26307, mortgage_insurance_percent#26308, relocation_mortgage_indicator#26312, orig_channel#27851, ... 3 more fields]\n",
- " : : : : : +- !GpuBroadcastHashJoin [property_type#26303], [property_type#28323], LeftOuter, BuildRight\n",
- " : : : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, property_type#26303, num_units#26304, occupancy_status#26305, property_state#26306, zip#26307, mortgage_insurance_percent#26308, relocation_mortgage_indicator#26312, ... 3 more fields]\n",
- " : : : : : : +- !GpuBroadcastHashJoin [loan_purpose#26302], [loan_purpose#28121], LeftOuter, BuildRight\n",
- " : : : : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, loan_purpose#26302, property_type#26303, num_units#26304, occupancy_status#26305, property_state#26306, zip#26307, mortgage_insurance_percent#26308, ... 3 more fields]\n",
- " : : : : : : : +- !GpuBroadcastHashJoin [first_home_buyer#26301], [first_home_buyer#27919], LeftOuter, BuildRight\n",
- " : : : : : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, first_home_buyer#26301, loan_purpose#26302, property_type#26303, num_units#26304, occupancy_status#26305, property_state#26306, zip#26307, ... 3 more fields]\n",
- " : : : : : : : : +- !GpuBroadcastHashJoin [orig_channel#26289], [orig_channel#27717], LeftOuter, BuildRight\n",
- " : : : : : : : : :- !GpuProject [interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036, orig_channel#26289, seller_name#27396, orig_interest_rate#26291, orig_upb#26292, orig_loan_term#26293, orig_ltv#26296, orig_cltv#26297, num_borrowers#26298, dti#26299, borrower_credit_score#26300, first_home_buyer#26301, loan_purpose#26302, property_type#26303, num_units#26304, occupancy_status#26305, property_state#26306, ... 3 more fields]\n",
- " : : : : : : : : : +- !GpuShuffledHashJoin [loan_id#26224L, quarter#26255], [loan_id#26288L, quarter#26313], Inner, BuildRight\n",
- " : : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(loan_id#26224L, quarter#26255, 192), true, [id=#17112]\n",
- " : : : : : : : : : : +- !GpuProject [quarter#26255, loan_id#26224L, interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, delinquency_12#27036]\n",
- " : : : : : : : : : : +- !GpuShuffledHashJoin [quarter#26255, loan_id#26224L, cast(timestamp_year#27100 as bigint), cast(timestamp_month#27064 as bigint)], [quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L], LeftOuter, BuildRight\n",
- " : : : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26255, loan_id#26224L, cast(timestamp_year#27100 as bigint), cast(timestamp_month#27064 as bigint), 192), true, [id=#17081]\n",
- " : : : : : : : : : : : +- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : : : : : : : +- *(1) Project [loan_id#26224L, interest_rate#26227, current_actual_upb#26228, loan_age#26229, msa#26233, current_loan_delinquency_status#26234, mod_flag#26235, non_interest_bearing_upb#26250, quarter#26255, month(cast(cast(unix_timestamp(monthly_reporting_period#26225, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#27064, year(cast(cast(unix_timestamp(monthly_reporting_period#26225, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#27100]\n",
- " : : : : : : : : : : : +- *(1) GpuColumnarToRow false\n",
- " : : : : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26224L) AND gpuisnotnull(quarter#26255))\n",
- " : : : : : : : : : : : +- GpuFileScan parquet [loan_id#26224L,monthly_reporting_period#26225,interest_rate#26227,current_actual_upb#26228,loan_age#26229,msa#26233,current_loan_delinquency_status#26234,mod_flag#26235,non_interest_bearing_upb#26250,quarter#26255] Batched: true, DataFilters: [isnotnull(loan_id#26224L), isnotnull(quarter#26255)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n",
- " : : : : : : : : : : +- *(3) GpuColumnarToRow false\n",
- " : : : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n",
- " : : : : : : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n",
- " : : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n",
- " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n",
- " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : : : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n",
- " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n",
- " : : : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n",
- " : : : : : : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n",
- " : : : : : : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n",
- " : : : : : : : : : :- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n",
- " : : : : : : : : : : +- *(2) GpuColumnarToRow false\n",
- " : : : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n",
- " : : : : : : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n",
- " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n",
- " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : : : : : : +- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n",
- " : : : : : : : : : +- *(3) GpuColumnarToRow false\n",
- " : : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n",
- " : : : : : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n",
- " : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n",
- " : : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n",
- " : : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n",
- " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n",
- " : : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n",
- " : : : : : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n",
- " : : : : : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n",
- " : : : : : : : : :- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n",
- " : : : : : : : : : +- *(2) GpuColumnarToRow false\n",
- " : : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n",
- " : : : : : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n",
- " : : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n",
- " : : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : : : : : +- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n",
- " : : : : : : : : +- *(3) GpuColumnarToRow false\n",
- " : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n",
- " : : : : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n",
- " : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n",
- " : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n",
- " : : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n",
- " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n",
- " : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n",
- " : : : : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n",
- " : : : : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n",
- " : : : : : : : :- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n",
- " : : : : : : : : +- *(2) GpuColumnarToRow false\n",
- " : : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n",
- " : : : : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n",
- " : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n",
- " : : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : : : : +- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n",
- " : : : : : : : +- *(3) GpuColumnarToRow false\n",
- " : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n",
- " : : : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n",
- " : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n",
- " : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n",
- " : : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n",
- " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n",
- " : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n",
- " : : : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n",
- " : : : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n",
- " : : : : : : :- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n",
- " : : : : : : : +- *(2) GpuColumnarToRow false\n",
- " : : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n",
- " : : : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n",
- " : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n",
- " : : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : : : +- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n",
- " : : : : : : +- *(3) GpuColumnarToRow false\n",
- " : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n",
- " : : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : : +- GpuCoalesceBatches RequireSingleBatch\n",
- " : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n",
- " : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n",
- " : : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n",
- " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n",
- " : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n",
- " : : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n",
- " : : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n",
- " : : : : : :- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n",
- " : : : : : : +- *(2) GpuColumnarToRow false\n",
- " : : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n",
- " : : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n",
- " : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n",
- " : : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : : +- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n",
- " : : : : : +- *(3) GpuColumnarToRow false\n",
- " : : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n",
- " : : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : +- GpuCoalesceBatches RequireSingleBatch\n",
- " : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n",
- " : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n",
- " : : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n",
- " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n",
- " : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n",
- " : : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n",
- " : : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n",
- " : : : : :- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n",
- " : : : : : +- *(2) GpuColumnarToRow false\n",
- " : : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n",
- " : : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n",
- " : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n",
- " : : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : : +- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n",
- " : : : : +- *(3) GpuColumnarToRow false\n",
- " : : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n",
- " : : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : +- GpuCoalesceBatches RequireSingleBatch\n",
- " : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n",
- " : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n",
- " : : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n",
- " : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n",
- " : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n",
- " : : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n",
- " : : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n",
- " : : : :- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n",
- " : : : : +- *(2) GpuColumnarToRow false\n",
- " : : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n",
- " : : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n",
- " : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n",
- " : : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : : +- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n",
- " : : : +- *(3) GpuColumnarToRow false\n",
- " : : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n",
- " : : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : +- GpuCoalesceBatches RequireSingleBatch\n",
- " : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n",
- " : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n",
- " : : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n",
- " : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n",
- " : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n",
- " : : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n",
- " : : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n",
- " : : :- GpuRowToColumnar TargetSize(536870912)\n",
- " : : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n",
- " : : : +- *(2) GpuColumnarToRow false\n",
- " : : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n",
- " : : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n",
- " : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n",
- " : : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : : +- GpuRowToColumnar TargetSize(536870912)\n",
- " : : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n",
- " : : +- *(3) GpuColumnarToRow false\n",
- " : : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n",
- " : : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : +- GpuCoalesceBatches RequireSingleBatch\n",
- " : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, timestamp_year#26990L, timestamp_month#27019L, 192), true, [id=#15140]\n",
- " : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : +- !GpuColumnarExchange gpuhashpartitioning(quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931, 192), true, [id=#15137]\n",
- " : +- GpuHashAggregate(keys=[quarter#27167, loan_id#27136L, josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, month_y#26931], functions=[]), filters=ArrayBuffer())\n",
- " : +- !GpuProject [quarter#27167, FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) AS josh_mody_n#26947L, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680, loan_id#27136L, month_y#26931]\n",
- " : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : +- !GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast(month_y#26931 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#26770 * 12) + timestamp_month#26734) - 24000) - month_y#26931) as double) / 12.0)) * 12)) + cast((month_y#26931 - 1) as bigint)) as double) / 12.0))))\n",
- " : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680], [month_y#26931]\n",
- " : +- !GpuProject [loan_id#27136L, quarter#27167, timestamp_month#26734, timestamp_year#26770, ever_30#26687, ever_90#26688, ever_180#26689, delinquency_30#26676, delinquency_90#26678, delinquency_180#26680]\n",
- " : +- !GpuBroadcastHashJoin [loan_id#27136L, quarter#27167], [loan_id#26885L, quarter#26916], LeftOuter, BuildRight\n",
- " : :- GpuRowToColumnar TargetSize(536870912)\n",
- " : : +- *(2) Project [quarter#27167, loan_id#27136L, month(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_month#26734, year(cast(cast(unix_timestamp(monthly_reporting_period#27137, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date)) AS timestamp_year#26770]\n",
- " : : +- *(2) GpuColumnarToRow false\n",
- " : : +- !GpuFilter (gpuisnotnull(loan_id#27136L) AND gpuisnotnull(quarter#27167))\n",
- " : : +- GpuFileScan parquet [loan_id#27136L,monthly_reporting_period#27137,quarter#27167] Batched: true, DataFilters: [isnotnull(loan_id#27136L), isnotnull(quarter#27167)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
- " : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true])), [id=#15129]\n",
- " : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[gpumax(current_loan_delinquency_status#26895), gpumin(delinquency_30#26658), gpumin(delinquency_90#26659), gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : +- GpuCoalesceBatches TargetSize(536870912)\n",
- " : +- !GpuColumnarExchange gpuhashpartitioning(quarter#26916, loan_id#26885L, 192), true, [id=#15126]\n",
- " : +- GpuHashAggregate(keys=[quarter#26916, loan_id#26885L], functions=[partial_gpumax(current_loan_delinquency_status#26895), partial_gpumin(delinquency_30#26658), partial_gpumin(delinquency_90#26659), partial_gpumin(delinquency_180#26660)]), filters=ArrayBuffer(None, None, None, None))\n",
- " : +- GpuRowToColumnar TargetSize(536870912)\n",
- " : +- *(3) Project [quarter#26916, loan_id#26885L, current_loan_delinquency_status#26895, CASE WHEN (current_loan_delinquency_status#26895 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_30#26658, CASE WHEN (current_loan_delinquency_status#26895 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_90#26659, CASE WHEN (current_loan_delinquency_status#26895 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#26886, MM/dd/yyyy, Some(Asia/Shanghai)) as timestamp) as date) END AS delinquency_180#26660]\n",
- " : +- *(3) GpuColumnarToRow false\n",
- " : +- !GpuFilter (gpuisnotnull(loan_id#26885L) AND gpuisnotnull(quarter#26916))\n",
- " : +- GpuFileScan parquet [loan_id#26885L,monthly_reporting_period#26886,current_loan_delinquency_status#26895,quarter#26916] Batched: true, DataFilters: [isnotnull(loan_id#26885L), isnotnull(quarter#26916)], Format: Parquet, Location: InMemoryFileIndex[file:/home/mengmengg/xgboost4j_spark/data/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : : : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ " : : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n",
+ " : : : : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n",
+ " : : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n",
+ " : : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n",
+ " : : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n",
+ " : : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n",
+ " : : : : : : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n",
+ " : : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n",
+ " : : : : : : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n",
+ " : : : : : : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n",
+ " : : : : : : : : : : :- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n",
+ " : : : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n",
+ " : : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n",
+ " : : : : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n",
+ " : : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n",
+ " : : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ " : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n",
+ " : : : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n",
+ " : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n",
+ " : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n",
+ " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n",
+ " : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n",
+ " : : : : : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n",
+ " : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n",
+ " : : : : : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n",
+ " : : : : : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n",
+ " : : : : : : : : : :- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n",
+ " : : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n",
+ " : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n",
+ " : : : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n",
+ " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n",
+ " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ " : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n",
+ " : : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n",
+ " : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n",
+ " : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n",
+ " : : : : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n",
+ " : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n",
+ " : : : : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n",
+ " : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n",
+ " : : : : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n",
+ " : : : : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n",
+ " : : : : : : : : :- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n",
+ " : : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n",
+ " : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n",
+ " : : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n",
+ " : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n",
+ " : : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ " : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n",
+ " : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n",
+ " : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n",
+ " : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n",
+ " : : : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n",
+ " : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n",
+ " : : : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n",
+ " : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n",
+ " : : : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n",
+ " : : : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n",
+ " : : : : : : : :- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n",
+ " : : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n",
+ " : : : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n",
+ " : : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n",
+ " : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n",
+ " : : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ " : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n",
+ " : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n",
+ " : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n",
+ " : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n",
+ " : : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n",
+ " : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n",
+ " : : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n",
+ " : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n",
+ " : : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n",
+ " : : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n",
+ " : : : : : : :- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n",
+ " : : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n",
+ " : : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n",
+ " : : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n",
+ " : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n",
+ " : : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ " : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n",
+ " : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n",
+ " : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n",
+ " : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n",
+ " : : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n",
+ " : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n",
+ " : : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n",
+ " : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n",
+ " : : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n",
+ " : : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n",
+ " : : : : : :- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n",
+ " : : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n",
+ " : : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n",
+ " : : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n",
+ " : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n",
+ " : : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ " : : : : +- GpuColumnarToRow false\n",
+ " : : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n",
+ " : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n",
+ " : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n",
+ " : : : : +- GpuColumnarToRow false\n",
+ " : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n",
+ " : : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n",
+ " : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n",
+ " : : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n",
+ " : : : : +- GpuColumnarToRow false\n",
+ " : : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n",
+ " : : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n",
+ " : : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n",
+ " : : : : :- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n",
+ " : : : : : +- GpuColumnarToRow false\n",
+ " : : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n",
+ " : : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n",
+ " : : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n",
+ " : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n",
+ " : : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : : : : +- GpuColumnarToRow false\n",
+ " : : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ " : : : +- GpuColumnarToRow false\n",
+ " : : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n",
+ " : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n",
+ " : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n",
+ " : : : +- GpuColumnarToRow false\n",
+ " : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n",
+ " : : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n",
+ " : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n",
+ " : : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n",
+ " : : : +- GpuColumnarToRow false\n",
+ " : : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n",
+ " : : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n",
+ " : : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n",
+ " : : : :- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n",
+ " : : : : +- GpuColumnarToRow false\n",
+ " : : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n",
+ " : : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n",
+ " : : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n",
+ " : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : +- GpuShuffleCoalesce 536870912\n",
+ " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n",
+ " : : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : : : +- GpuColumnarToRow false\n",
+ " : : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ " : : +- GpuColumnarToRow false\n",
+ " : : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n",
+ " : : +- GpuShuffleCoalesce 536870912\n",
+ " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n",
+ " : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n",
+ " : : +- GpuColumnarToRow false\n",
+ " : : +- GpuShuffleCoalesce 536870912\n",
+ " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n",
+ " : : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n",
+ " : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n",
+ " : : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n",
+ " : : +- GpuColumnarToRow false\n",
+ " : : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n",
+ " : : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n",
+ " : : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n",
+ " : : :- GpuRowToColumnar targetsize(536870912)\n",
+ " : : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n",
+ " : : : +- GpuColumnarToRow false\n",
+ " : : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n",
+ " : : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n",
+ " : : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n",
+ " : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : +- GpuShuffleCoalesce 536870912\n",
+ " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n",
+ " : : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : : +- GpuColumnarToRow false\n",
+ " : : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ " : +- GpuColumnarToRow false\n",
+ " : +- GpuSort [quarter#2484 ASC NULLS FIRST, loan_id#2453L ASC NULLS FIRST, timestamp_year#2307L ASC NULLS FIRST, timestamp_month#2336L ASC NULLS FIRST], false, com.nvidia.spark.rapids.OutOfCoreSort$@163d9f7d\n",
+ " : +- GpuShuffleCoalesce 536870912\n",
+ " : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, timestamp_year#2307L, timestamp_month#2336L, 192), ENSURE_REQUIREMENTS, [id=#1114]\n",
+ " : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : +- *(6) HashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[])\n",
+ " : +- GpuColumnarToRow false\n",
+ " : +- GpuShuffleCoalesce 536870912\n",
+ " : +- GpuColumnarExchange gpuhashpartitioning(quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248, 192), ENSURE_REQUIREMENTS, [id=#1107]\n",
+ " : +- GpuHashAggregate(keys=[quarter#2484, loan_id#2453L, josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, month_y#2248], functions=[]), filters=ArrayBuffer())\n",
+ " : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : +- *(5) Project [quarter#2484, FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) AS josh_mody_n#2264L, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997, loan_id#2453L, month_y#2248]\n",
+ " : +- *(5) Filter (isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast((month_y#2248 - 1) as bigint)) as double) / 12.0))) AND isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#2087 * 12) + timestamp_month#2051) - 24000) - month_y#2248) as double) / 12.0)) * 12)) + cast(month_y#2248 as bigint)), 12) END))\n",
+ " : +- GpuColumnarToRow false\n",
+ " : +- GpuGenerate gpuexplode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997], false, [month_y#2248]\n",
+ " : +- GpuProject [loan_id#2453L, quarter#2484, timestamp_month#2051, timestamp_year#2087, ever_30#2004, ever_90#2005, ever_180#2006, delinquency_30#1993, delinquency_90#1995, delinquency_180#1997]\n",
+ " : +- GpuBroadcastHashJoin [loan_id#2453L, quarter#2484], [loan_id#2202L, quarter#2233], LeftOuter, GpuBuildRight\n",
+ " : :- GpuRowToColumnar targetsize(536870912)\n",
+ " : : +- *(3) Project [quarter#2484, loan_id#2453L, month(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_month#2051, year(cast(gettimestamp(monthly_reporting_period#2454, MM/dd/yyyy, Some(America/Los_Angeles), false) as date)) AS timestamp_year#2087]\n",
+ " : : +- GpuColumnarToRow false\n",
+ " : : +- GpuFilter (gpuisnotnull(quarter#2484) AND gpuisnotnull(loan_id#2453L)), true\n",
+ " : : +- GpuFileGpuScan parquet [loan_id#2453L,monthly_reporting_period#2454,quarter#2484] Batched: true, DataFilters: [isnotnull(quarter#2484), isnotnull(loan_id#2453L)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n",
+ " : +- GpuBroadcastExchange HashedRelationBroadcastMode(List(input[1, bigint, true], input[0, string, true]),false), [id=#1096]\n",
+ " : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[gpumax(current_loan_delinquency_status#2212), gpumin(delinquency_30#1975), gpumin(delinquency_90#1976), gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : +- GpuShuffleCoalesce 536870912\n",
+ " : +- GpuColumnarExchange gpuhashpartitioning(quarter#2233, loan_id#2202L, 192), ENSURE_REQUIREMENTS, [id=#1093]\n",
+ " : +- GpuHashAggregate(keys=[quarter#2233, loan_id#2202L], functions=[partial_gpumax(current_loan_delinquency_status#2212), partial_gpumin(delinquency_30#1975), partial_gpumin(delinquency_90#1976), partial_gpumin(delinquency_180#1977)]), filters=ArrayBuffer(None, None, None, None))\n",
+ " : +- GpuRowToColumnar targetsize(536870912)\n",
+ " : +- *(4) Project [quarter#2233, loan_id#2202L, current_loan_delinquency_status#2212, CASE WHEN (current_loan_delinquency_status#2212 >= 1) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_30#1975, CASE WHEN (current_loan_delinquency_status#2212 >= 3) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_90#1976, CASE WHEN (current_loan_delinquency_status#2212 >= 6) THEN cast(gettimestamp(monthly_reporting_period#2203, MM/dd/yyyy, Some(America/Los_Angeles), false) as date) END AS delinquency_180#1977]\n",
+ " : +- GpuColumnarToRow false\n",
+ " : +- GpuFilter (gpuisnotnull(loan_id#2202L) AND gpuisnotnull(quarter#2233)), true\n",
+ " : +- GpuFileGpuScan parquet [loan_id#2202L,monthly_reporting_period#2203,current_loan_delinquency_status#2212,quarter#2233] Batched: true, DataFilters: [isnotnull(loan_id#2202L), isnotnull(quarter#2233)], Format: Parquet, Location: InMemoryFileIndex[file:/local/saralihalli/HOME/mortgage/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n",
+ "Download Single-Family Loan Performance Data from [Fannie Mae](https://datadynamics.fanniemae.com/data-dynamics/#/reportMenu;category=HP) website. \n",
"\n",
"### 2. Download needed jars\n",
"* [rapids-4-spark_2.12-22.06.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.06.0/rapids-4-spark_2.12-22.06.0.jar)\n",
@@ -66,38 +67,15 @@
},
{
"cell_type": "code",
- "execution_count": 2,
- "id": "674771a8",
+ "execution_count": null,
+ "id": "b2834c06",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dataRoot = /data\n",
- "perfPath = /data/mortgage/Performance/\n",
- "acqPath = /data/mortgage/Acquisition/\n",
- "outPath = /data/mortgage/output/\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "/data"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "val dataRoot = sys.env.getOrElse(\"DATA_ROOT\", \"/data\")\n",
- "val perfPath = dataRoot + \"/mortgage/Performance/\"\n",
- "val acqPath = dataRoot + \"/mortgage/Acquisition/\"\n",
- "val outPath = dataRoot + \"/mortgage/output/\"\n"
+ "val dataRoot = sys.env.getOrElse(\"DATA_ROOT\", \"/mortgage\")\n",
+ "val dataPath = dataRoot + \"/input\"\n",
+ "val outPath = dataRoot + \"/output\"\n",
+ "val saveTrainEvalDataset = true"
]
},
{
@@ -120,7 +98,7 @@
{
"data": {
"text/plain": [
- "performanceSchema = StructType(StructField(loan_id,LongType,true), StructField(monthly_reporting_period,StringType,true), StructField(servicer,StringType,true), StructField(interest_rate,DoubleType,true), StructField(current_actual_upb,DoubleType,true), StructField(loan_age,DoubleType,true), StructField(remaining_months_to_legal_maturity,DoubleType,true), StructField(adj_remaining_months_to_maturity,DoubleType,true), StructField(maturity_date,StringType,true), StructField(msa,DoubleType,true), StructField(current_loan_delinquency_status,IntegerType,true), StructField(mod_flag,StringType,true), StructField(zero_balance_code,StringType,true), StructField(zero_balance_effective_date,StringType,true), StructField(last_paid_installment_date,StringType,t...\n"
+ "rawSchema = StructType(StructField(reference_pool_id,StringType,true), StructField(loan_id,LongType,true), StructField(monthly_reporting_period,StringType,true), StructField(orig_channel,StringType,true), StructField(seller_name,StringType,true), StructField(servicer,StringType,true), StructField(master_servicer,StringType,true), StructField(orig_interest_rate,DoubleType,true), StructField(interest_rate,DoubleType,true), StructField(orig_upb,IntegerType,true), StructField(upb_at_issuance,StringType,true), StructField(current_actual_upb,DoubleType,true), StructField(orig_loan_term,IntegerType,true), StructField(orig_date,StringType,true), StructField(first_pay_date,StringType,true), StructField(loan_age,DoubleType,true), StructField(remaining_months...\n"
]
},
"metadata": {},
@@ -129,7 +107,7 @@
{
"data": {
"text/plain": [
- "StructType(StructField(loan_id,LongType,true), StructField(monthly_reporting_period,StringType,true), StructField(servicer,StringType,true), StructField(interest_rate,DoubleType,true), StructField(current_actual_upb,DoubleType,true), StructField(loan_age,DoubleType,true), StructField(remaining_months_to_legal_maturity,DoubleType,true), StructField(adj_remaining_months_to_maturity,DoubleType,true), StructField(maturity_date,StringType,true), StructField(msa,DoubleType,true), StructField(current_loan_delinquency_status,IntegerType,true), StructField(mod_flag,StringType,true), StructField(zero_balance_code,StringType,true), StructField(zero_balance_effective_date,StringType,true), StructField(last_paid_installment_date,StringType,t..."
+ "StructType(StructField(reference_pool_id,StringType,true), StructField(loan_id,LongType,true), StructField(monthly_reporting_period,StringType,true), StructField(orig_channel,StringType,true), StructField(seller_name,StringType,true), StructField(servicer,StringType,true), StructField(master_servicer,StringType,true), StructField(orig_interest_rate,DoubleType,true), StructField(interest_rate,DoubleType,true), StructField(orig_upb,IntegerType,true), StructField(upb_at_issuance,StringType,true), StructField(current_actual_upb,DoubleType,true), StructField(orig_loan_term,IntegerType,true), StructField(orig_date,StringType,true), StructField(first_pay_date,StringType,true), StructField(loan_age,DoubleType,true), StructField(remaining_months..."
]
},
"execution_count": 3,
@@ -139,21 +117,57 @@
],
"source": [
"// File schema\n",
- "val performanceSchema = StructType(Array(\n",
+ "val rawSchema = StructType(Array(\n",
+ " StructField(\"reference_pool_id\", StringType),\n",
" StructField(\"loan_id\", LongType),\n",
" StructField(\"monthly_reporting_period\", StringType),\n",
+ " StructField(\"orig_channel\", StringType),\n",
+ " StructField(\"seller_name\", StringType),\n",
" StructField(\"servicer\", StringType),\n",
+ " StructField(\"master_servicer\", StringType),\n",
+ " StructField(\"orig_interest_rate\", DoubleType),\n",
" StructField(\"interest_rate\", DoubleType),\n",
+ " StructField(\"orig_upb\", DoubleType),\n",
+ " StructField(\"upb_at_issuance\", StringType),\n",
" StructField(\"current_actual_upb\", DoubleType),\n",
+ " StructField(\"orig_loan_term\", IntegerType),\n",
+ " StructField(\"orig_date\", StringType),\n",
+ " StructField(\"first_pay_date\", StringType), \n",
" StructField(\"loan_age\", DoubleType),\n",
" StructField(\"remaining_months_to_legal_maturity\", DoubleType),\n",
" StructField(\"adj_remaining_months_to_maturity\", DoubleType),\n",
" StructField(\"maturity_date\", StringType),\n",
+ " StructField(\"orig_ltv\", DoubleType),\n",
+ " StructField(\"orig_cltv\", DoubleType),\n",
+ " StructField(\"num_borrowers\", DoubleType),\n",
+ " StructField(\"dti\", DoubleType),\n",
+ " StructField(\"borrower_credit_score\", DoubleType),\n",
+ " StructField(\"coborrow_credit_score\", DoubleType),\n",
+ " StructField(\"first_home_buyer\", StringType),\n",
+ " StructField(\"loan_purpose\", StringType),\n",
+ " StructField(\"property_type\", StringType),\n",
+ " StructField(\"num_units\", IntegerType),\n",
+ " StructField(\"occupancy_status\", StringType),\n",
+ " StructField(\"property_state\", StringType),\n",
" StructField(\"msa\", DoubleType),\n",
+ " StructField(\"zip\", IntegerType),\n",
+ " StructField(\"mortgage_insurance_percent\", DoubleType),\n",
+ " StructField(\"product_type\", StringType),\n",
+ " StructField(\"prepayment_penalty_indicator\", StringType),\n",
+ " StructField(\"interest_only_loan_indicator\", StringType),\n",
+ " StructField(\"interest_only_first_principal_and_interest_payment_date\", StringType),\n",
+ " StructField(\"months_to_amortization\", StringType),\n",
" StructField(\"current_loan_delinquency_status\", IntegerType),\n",
+ " StructField(\"loan_payment_history\", StringType),\n",
" StructField(\"mod_flag\", StringType),\n",
+ " StructField(\"mortgage_insurance_cancellation_indicator\", StringType),\n",
" StructField(\"zero_balance_code\", StringType),\n",
" StructField(\"zero_balance_effective_date\", StringType),\n",
+ " StructField(\"upb_at_the_time_of_removal\", StringType),\n",
+ " StructField(\"repurchase_date\", StringType),\n",
+ " StructField(\"scheduled_principal_current\", StringType),\n",
+ " StructField(\"total_principal_current\", StringType),\n",
+ " StructField(\"unscheduled_principal_current\", StringType),\n",
" StructField(\"last_paid_installment_date\", StringType),\n",
" StructField(\"foreclosed_after\", StringType),\n",
" StructField(\"disposition_date\", StringType),\n",
@@ -168,37 +182,51 @@
" StructField(\"other_foreclosure_proceeds\", DoubleType),\n",
" StructField(\"non_interest_bearing_upb\", DoubleType),\n",
" StructField(\"principal_forgiveness_upb\", StringType),\n",
- " StructField(\"repurchase_make_whole_proceeds_flag\", StringType),\n",
- " StructField(\"foreclosure_principal_write_off_amount\", StringType),\n",
- " StructField(\"servicing_activity_indicator\", StringType))\n",
- " )\n",
- "\n",
- "val acquisitionSchema = StructType(Array(\n",
- " StructField(\"loan_id\", LongType),\n",
- " StructField(\"orig_channel\", StringType),\n",
- " StructField(\"seller_name\", StringType),\n",
- " StructField(\"orig_interest_rate\", DoubleType),\n",
- " StructField(\"orig_upb\", IntegerType),\n",
- " StructField(\"orig_loan_term\", IntegerType),\n",
- " StructField(\"orig_date\", StringType),\n",
- " StructField(\"first_pay_date\", StringType),\n",
- " StructField(\"orig_ltv\", DoubleType),\n",
- " StructField(\"orig_cltv\", DoubleType),\n",
- " StructField(\"num_borrowers\", DoubleType),\n",
- " StructField(\"dti\", DoubleType),\n",
- " StructField(\"borrower_credit_score\", DoubleType),\n",
- " StructField(\"first_home_buyer\", StringType),\n",
- " StructField(\"loan_purpose\", StringType),\n",
- " StructField(\"property_type\", StringType),\n",
- " StructField(\"num_units\", IntegerType),\n",
- " StructField(\"occupancy_status\", StringType),\n",
- " StructField(\"property_state\", StringType),\n",
- " StructField(\"zip\", IntegerType),\n",
- " StructField(\"mortgage_insurance_percent\", DoubleType),\n",
- " StructField(\"product_type\", StringType),\n",
- " StructField(\"coborrow_credit_score\", DoubleType),\n",
+ " StructField(\"original_list_start_date\", StringType),\n",
+ " StructField(\"original_list_price\", StringType),\n",
+ " StructField(\"current_list_start_date\", StringType),\n",
+ " StructField(\"current_list_price\", StringType),\n",
+ " StructField(\"borrower_credit_score_at_issuance\", StringType),\n",
+ " StructField(\"co-borrower_credit_score_at_issuance\", StringType),\n",
+ " StructField(\"borrower_credit_score_current\", StringType),\n",
+ " StructField(\"co-Borrower_credit_score_current\", StringType),\n",
" StructField(\"mortgage_insurance_type\", DoubleType),\n",
- " StructField(\"relocation_mortgage_indicator\", StringType))\n",
+ " StructField(\"servicing_activity_indicator\", StringType),\n",
+ " StructField(\"current_period_modification_loss_amount\", StringType),\n",
+ " StructField(\"cumulative_modification_loss_amount\", StringType),\n",
+ " StructField(\"current_period_credit_event_net_gain_or_loss\", StringType),\n",
+ " StructField(\"cumulative_credit_event_net_gain_or_loss\", StringType),\n",
+ " StructField(\"homeready_program_indicator\", StringType),\n",
+ " StructField(\"foreclosure_principal_write_off_amount\", StringType),\n",
+ " StructField(\"relocation_mortgage_indicator\", StringType),\n",
+ " StructField(\"zero_balance_code_change_date\", StringType),\n",
+ " StructField(\"loan_holdback_indicator\", StringType),\n",
+ " StructField(\"loan_holdback_effective_date\", StringType),\n",
+ " StructField(\"delinquent_accrued_interest\", StringType),\n",
+ " StructField(\"property_valuation_method\", StringType),\n",
+ " StructField(\"high_balance_loan_indicator\", StringType),\n",
+ " StructField(\"arm_initial_fixed-rate_period_lt_5_yr_indicator\", StringType),\n",
+ " StructField(\"arm_product_type\", StringType),\n",
+ " StructField(\"initial_fixed-rate_period\", StringType),\n",
+ " StructField(\"interest_rate_adjustment_frequency\", StringType),\n",
+ " StructField(\"next_interest_rate_adjustment_date\", StringType),\n",
+ " StructField(\"next_payment_change_date\", StringType),\n",
+ " StructField(\"index\", StringType),\n",
+ " StructField(\"arm_cap_structure\", StringType),\n",
+ " StructField(\"initial_interest_rate_cap_up_percent\", StringType),\n",
+ " StructField(\"periodic_interest_rate_cap_up_percent\", StringType),\n",
+ " StructField(\"lifetime_interest_rate_cap_up_percent\", StringType),\n",
+ " StructField(\"mortgage_margin\", StringType),\n",
+ " StructField(\"arm_balloon_indicator\", StringType),\n",
+ " StructField(\"arm_plan_number\", StringType),\n",
+ " StructField(\"borrower_assistance_plan\", StringType),\n",
+ " StructField(\"hltv_refinance_option_indicator\", StringType),\n",
+ " StructField(\"deal_name\", StringType),\n",
+ " StructField(\"repurchase_make_whole_proceeds_flag\", StringType),\n",
+ " StructField(\"alternative_delinquency_resolution\", StringType),\n",
+ " StructField(\"alternative_delinquency_resolution_count\", StringType),\n",
+ " StructField(\"total_deferral_amount\", StringType)\n",
+ " )\n",
" )"
]
},
@@ -356,7 +384,7 @@
" // So we strip off the .txt and everything after it\n",
" // and then take everything after the last remaining _\n",
" def apply(): Column = substring_index(\n",
- " substring_index(input_file_name(), \".\", 1), \"_\", -1)\n",
+ " substring_index(input_file_name(), \".\", 1), \"/\", -1)\n",
"}"
]
},
@@ -413,7 +441,7 @@
"\n",
"val numericCols = List(\n",
" (\"orig_interest_rate\", FloatType),\n",
- " (\"orig_upb\", IntegerType),\n",
+ " (\"orig_upb\", DoubleType),\n",
" (\"orig_loan_term\", IntegerType),\n",
" (\"orig_ltv\", FloatType),\n",
" (\"orig_cltv\", FloatType),\n",
@@ -556,6 +584,120 @@
" }"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "9e1fbb61",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "defined object extractPerfColumns\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "object extractPerfColumns{\n",
+ " def apply(rawDf : DataFrame) : DataFrame = {\n",
+ " val perfDf = rawDf.select(\n",
+ " col(\"loan_id\"),\n",
+ " date_format(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"), \"MM/dd/yyyy\").as(\"monthly_reporting_period\"),\n",
+ " upper(col(\"servicer\")).as(\"servicer\"),\n",
+ " col(\"interest_rate\"),\n",
+ " col(\"current_actual_upb\"),\n",
+ " col(\"loan_age\"),\n",
+ " col(\"remaining_months_to_legal_maturity\"),\n",
+ " col(\"adj_remaining_months_to_maturity\"),\n",
+ " date_format(to_date(col(\"maturity_date\"),\"MMyyyy\"), \"MM/yyyy\").as(\"maturity_date\"),\n",
+ " col(\"msa\"),\n",
+ " col(\"current_loan_delinquency_status\"),\n",
+ " col(\"mod_flag\"),\n",
+ " col(\"zero_balance_code\"),\n",
+ " date_format(to_date(col(\"zero_balance_effective_date\"),\"MMyyyy\"), \"MM/yyyy\").as(\"zero_balance_effective_date\"),\n",
+ " date_format(to_date(col(\"last_paid_installment_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").as(\"last_paid_installment_date\"),\n",
+ " date_format(to_date(col(\"foreclosed_after\"),\"MMyyyy\"), \"MM/dd/yyyy\").as(\"foreclosed_after\"),\n",
+ " date_format(to_date(col(\"disposition_date\"),\"MMyyyy\"), \"MM/dd/yyyy\").as(\"disposition_date\"),\n",
+ " col(\"foreclosure_costs\"),\n",
+ " col(\"prop_preservation_and_repair_costs\"),\n",
+ " col(\"asset_recovery_costs\"),\n",
+ " col(\"misc_holding_expenses\"),\n",
+ " col(\"holding_taxes\"),\n",
+ " col(\"net_sale_proceeds\"),\n",
+ " col(\"credit_enhancement_proceeds\"),\n",
+ " col(\"repurchase_make_whole_proceeds\"),\n",
+ " col(\"other_foreclosure_proceeds\"),\n",
+ " col(\"non_interest_bearing_upb\"),\n",
+ " col(\"principal_forgiveness_upb\"),\n",
+ " col(\"repurchase_make_whole_proceeds_flag\"),\n",
+ " col(\"foreclosure_principal_write_off_amount\"),\n",
+ " col(\"servicing_activity_indicator\"),\n",
+ " col(\"quarter\")\n",
+ " )\n",
+ " \n",
+ " perfDf.select(\"*\").filter(\"current_actual_upb != 0.0\")\n",
+ " }\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "ce429163",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "defined object extractAcqColumns\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "object extractAcqColumns{\n",
+ " def apply(rawDf : DataFrame) : DataFrame = {\n",
+ " val acqDf = rawDf.select(\n",
+ " col(\"loan_id\"),\n",
+ " col(\"orig_channel\"),\n",
+ " upper(col(\"seller_name\")).as(\"seller_name\"),\n",
+ " col(\"orig_interest_rate\"),\n",
+ " col(\"orig_upb\"),\n",
+ " col(\"orig_loan_term\"),\n",
+ " date_format(to_date(col(\"orig_date\"),\"MMyyyy\"), \"MM/yyyy\").as(\"orig_date\"),\n",
+ " date_format(to_date(col(\"first_pay_date\"),\"MMyyyy\"), \"MM/yyyy\").as(\"first_pay_date\"),\n",
+ " col(\"orig_ltv\"),\n",
+ " col(\"orig_cltv\"),\n",
+ " col(\"num_borrowers\"),\n",
+ " col(\"dti\"),\n",
+ " col(\"borrower_credit_score\"),\n",
+ " col(\"first_home_buyer\"),\n",
+ " col(\"loan_purpose\"),\n",
+ " col(\"property_type\"),\n",
+ " col(\"num_units\"),\n",
+ " col(\"occupancy_status\"),\n",
+ " col(\"property_state\"),\n",
+ " col(\"zip\"),\n",
+ " col(\"mortgage_insurance_percent\"),\n",
+ " col(\"product_type\"),\n",
+ " col(\"coborrow_credit_score\"),\n",
+ " col(\"mortgage_insurance_type\"),\n",
+ " col(\"relocation_mortgage_indicator\"),\n",
+ " col(\"quarter\"),\n",
+ " dense_rank().over(Window.partitionBy(\"loan_id\").orderBy(to_date(col(\"monthly_reporting_period\"),\"MMyyyy\"))).as(\"rank\")\n",
+ " )\n",
+ "\n",
+ " acqDf.select(\"*\").filter(col(\"rank\") === 1)\n",
+ " }\n",
+ "\n",
+ "}"
+ ]
+ },
{
"cell_type": "markdown",
"id": "37c64d85",
@@ -566,15 +708,15 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 11,
"id": "98d37174",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "sparkSession = org.apache.spark.sql.SparkSession@1d87c1c2\n",
- "reader = org.apache.spark.sql.DataFrameReader@2e8a7a69\n"
+ "sparkSession = org.apache.spark.sql.SparkSession@694178ec\n",
+ "reader = org.apache.spark.sql.DataFrameReader@4b2afd51\n"
]
},
"metadata": {},
@@ -583,18 +725,30 @@
{
"data": {
"text/plain": [
- "org.apache.spark.sql.DataFrameReader@2e8a7a69"
+ "org.apache.spark.sql.DataFrameReader@4b2afd51"
]
},
- "execution_count": 9,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"// Build the spark session and data reader as usual\n",
- "val sparkSession = SparkSession.builder.appName(\"mortgage-gpu\").getOrCreate\n",
- "val reader = sparkSession.read.option(\"header\", true).schema(performanceSchema)"
+ "val sparkSession = SparkSession.builder.appName(\"mortgage-gpu\").config(\"spark.sql.cache.serializer\", \"com.nvidia.spark.ParquetCachedBatchSerializer\").getOrCreate\n",
+ "\n",
+ "// GPU run, set to true\n",
+ "sparkSession.conf.set(\"spark.rapids.sql.enabled\", true)\n",
+ "// CPU run, set to false\n",
+ "// sparkSession.conf.set('spark.rapids.sql.enabled', 'false')\n",
+ "// remove config(\"spark.sql.cache.serializer\", \"com.nvidia.spark.ParquetCachedBatchSerializer\") for CPU\n",
+ "sparkSession.conf.set(\"spark.sql.files.maxPartitionBytes\", \"1G\")\n",
+ "sparkSession.conf.set(\"spark.sql.broadcastTimeout\", 700)\n",
+ "sparkSession.conf.set(\"spark.rapids.sql.hasNans\", false)\n",
+ "// use GPU to read CSV\n",
+ "sparkSession.conf.set(\"spark.rapids.sql.csv.read.double.enabled\", true)\n",
+ "\n",
+ "val reader = sparkSession.read.schema(rawSchema)"
]
},
{
@@ -607,7 +761,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 12,
"id": "5bac2301",
"metadata": {},
"outputs": [
@@ -615,8 +769,9 @@
"data": {
"text/plain": [
"optionsMap = Map(header -> true)\n",
+ "rawDf = [reference_pool_id: string, loan_id: bigint ... 107 more fields]\n",
"perfSet = [loan_id: bigint, monthly_reporting_period: string ... 30 more fields]\n",
- "acqSet = [loan_id: bigint, orig_channel: string ... 24 more fields]\n"
+ "acqSet = [loan_id: bigint, orig_channel: string ... 25 more fields]\n"
]
},
"metadata": {},
@@ -625,28 +780,25 @@
{
"data": {
"text/plain": [
- "[loan_id: bigint, orig_channel: string ... 24 more fields]"
+ "[loan_id: bigint, orig_channel: string ... 25 more fields]"
]
},
- "execution_count": 10,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "val optionsMap = Map(\"header\" -> \"true\")\n",
- "val perfSet = reader.options(optionsMap)\n",
+ "val rawDf = reader.option(\"header\", false)\n",
" .option(\"nullValue\", \"\")\n",
" .option(\"delimiter\", \"|\")\n",
" .option(\"parserLib\", \"univocity\")\n",
- " .schema(performanceSchema)\n",
- " .csv(perfPath)\n",
+ " .schema(rawSchema)\n",
+ " .csv(dataPath)\n",
" .withColumn(\"quarter\", GetQuarterFromCsvFileName())\n",
- "val acqSet = reader.options(optionsMap)\n",
- " .option(\"delimiter\", \"|\")\n",
- " .schema(acquisitionSchema)\n",
- " .csv(acqPath)\n",
- " .withColumn(\"quarter\", GetQuarterFromCsvFileName())"
+ "\n",
+ "val perfSet = extractPerfColumns(rawDf)\n",
+ "val acqSet = extractAcqColumns(rawDf)"
]
},
{
@@ -659,7 +811,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 13,
"id": "a16155cb",
"metadata": {},
"outputs": [
@@ -681,7 +833,7 @@
"List(orig_channel, first_home_buyer, loan_purpose, property_type, occupancy_status, property_state, product_type, relocation_mortgage_indicator, seller_name, mod_flag, orig_interest_rate, orig_upb, orig_loan_term, orig_ltv, orig_cltv, num_borrowers, dti, borrower_credit_score, num_units, zip, mortgage_insurance_percent, current_loan_delinquency_status, current_actual_upb, interest_rate, loan_age, msa, non_interest_bearing_upb, delinquency_12)"
]
},
- "execution_count": 11,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -816,7 +968,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 14,
"id": "78b76252",
"metadata": {},
"outputs": [
@@ -859,7 +1011,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 15,
"id": "ffdb0a62",
"metadata": {},
"outputs": [
@@ -867,16 +1019,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Elapsed time : 35.638s\n"
+ "Elapsed time : 399.241s\n"
]
},
{
"data": {
"text/plain": [
- "t0 = 1654138715501\n",
+ "t0 = 1656695479451\n",
"optionsMap = Map(header -> true)\n",
"rawDF = [orig_channel: int, first_home_buyer: int ... 26 more fields]\n",
- "t1 = 1654138751139\n"
+ "t1 = 1656695878692\n"
]
},
"metadata": {},
@@ -885,42 +1037,47 @@
{
"data": {
"text/plain": [
- "1654138751139"
+ "1656695878692"
]
},
- "execution_count": 13,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val t0 = System.currentTimeMillis\n",
- "val optionsMap = Map(\"header\" -> \"true\")\n",
"val rawDF = transform(\n",
" perfSet,\n",
" acqSet,\n",
" sparkSession\n",
" )\n",
- "rawDF.write.mode(\"overwrite\").parquet(new Path(outPath, \"data\").toString)\n",
+ "\n",
+ "val etlDataPath = new Path(outPath, \"data\").toString\n",
+ "rawDF.write.mode(\"overwrite\").parquet(etlDataPath)\n",
+ "\n",
+ "if(saveTrainEvalDataset == true)\n",
+ "{\n",
+ " val etlDf = sparkSession.read.parquet(etlDataPath)\n",
+ " val sets = etlDf.randomSplit(Array[Double](0.8, 0.2))\n",
+ " val train = sets(0)\n",
+ " val eval = sets(1)\n",
+ " train.write.mode(\"overwrite\").parquet(new Path(outPath, \"train\").toString)\n",
+ " eval.write.mode(\"overwrite\").parquet(new Path(outPath, \"eval\").toString)\n",
+ "}\n",
+ "\n",
+ "\n",
"val t1 = System.currentTimeMillis\n",
"println(\"Elapsed time : \" + ((t1 - t0).toFloat / 1000) + \"s\")\n",
"sparkSession.stop()"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "4388fe96",
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
"kernelspec": {
- "display_name": "XGBoost4j-Spark - Scala",
+ "display_name": "XGBoost4j-Spark_new2 - Scala",
"language": "scala",
- "name": "XGBoost4j-Spark_scala"
+ "name": "xgboost4j-spark_new2_scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
@@ -933,4 +1090,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
-}
\ No newline at end of file
+}
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb
index 9376f0de4..398f2b3e8 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb
@@ -47,39 +47,15 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dataRoot = /data\n",
- "trainPath = /data/mortgage/csv/train/\n",
- "evalPath = /data/mortgage/csv/test/\n",
- "transPath = /data/mortgage/csv/test/\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "/data/mortgage/csv/test/"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
- "// You need to update them to your real paths! The input data files can be the output of mortgage-etl jobs, or you can\n",
- "// just use the provided sample datasets upder datasets path. \n",
- "val dataRoot = sys.env.getOrElse(\"DATA_ROOT\", \"/data\")\n",
- "val trainPath = dataRoot + \"/mortgage/csv/train/\"\n",
- "val evalPath = dataRoot + \"/mortgage/csv/test/\"\n",
- "val transPath = dataRoot + \"/mortgage/csv/test/\""
+    "// You need to update them to your real paths! The input data files are the output of the mortgage-etl job\n",
+ "val dataRoot = sys.env.getOrElse(\"DATA_ROOT\", \"/mortgage\")\n",
+ "val trainPath = dataRoot + \"/train/\"\n",
+ "val evalPath = dataRoot + \"/eval/\"\n",
+ "val transPath = dataRoot + \"/test/\""
]
},
{
@@ -132,7 +108,7 @@
" StructField(\"seller_name\", DoubleType),\n",
" StructField(\"mod_flag\", DoubleType),\n",
" StructField(\"orig_interest_rate\", DoubleType),\n",
- " StructField(\"orig_upb\", IntegerType),\n",
+ " StructField(\"orig_upb\", DoubleType),\n",
" StructField(\"orig_loan_term\", IntegerType),\n",
" StructField(\"orig_ltv\", DoubleType),\n",
" StructField(\"orig_cltv\", DoubleType),\n",
@@ -208,7 +184,7 @@
"source": [
"// Build the spark session and data reader as usual\n",
"val sparkSession = SparkSession.builder.appName(\"mortgage-gpu\").getOrCreate\n",
- "val reader = sparkSession.read.option(\"header\", true).schema(schema)"
+ "val reader = sparkSession.read"
]
},
{
@@ -239,10 +215,9 @@
}
],
"source": [
- "// Please make sure to change the api to reader.parquet if you load parquet files.\n",
- "val trainSet = reader.csv(trainPath)\n",
- "val evalSet = reader.csv(evalPath)\n",
- "val transSet = reader.csv(transPath)"
+ "val trainSet = reader.parquet(trainPath)\n",
+ "val evalSet = reader.parquet(evalPath)\n",
+ "val transSet = reader.parquet(transPath)"
]
},
{
@@ -588,9 +563,9 @@
}
],
"source": [
- "xgbClassificationModel.write.overwrite.save(dataRoot + \"/model/mortgage\")\n",
+ "xgbClassificationModel.write.overwrite.save(dataRoot + \"/model/\")\n",
"\n",
- "val modelFromDisk = XGBoostClassificationModel.load(dataRoot + \"/model/mortgage\")\n",
+ "val modelFromDisk = XGBoostClassificationModel.load(dataRoot + \"/model/\")\n",
"\n",
"val (results2, _) = Benchmark.time(\"transform2\") {\n",
" modelFromDisk.transform(transSet)\n",
@@ -606,13 +581,6 @@
"source": [
"sparkSession.close()"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -632,4 +600,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
\ No newline at end of file
+}
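
The training notebook above now reads the parquet output of the mortgage ETL job and expects three folders under `DATA_ROOT` (`train/`, `eval/`, `test/`), while the ETL notebook's optional split only writes `train/` and `eval/`. A hedged sketch of one way to populate all three folders from the full ETL output follows; the three-way split ratios and the `data/` subfolder layout are assumptions, not something the examples prescribe.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder.appName("mortgage-make-splits").getOrCreate()
val dataRoot = sys.env.getOrElse("DATA_ROOT", "/mortgage")  // same convention as the notebook

// Assumes the full ETL result was written to <dataRoot>/data, as in the ETL notebook.
val etlDF = spark.read.parquet(dataRoot + "/data")
val Array(train, eval, test) = etlDF.randomSplit(Array(0.8, 0.1, 0.1), seed = 42)

train.write.mode("overwrite").parquet(dataRoot + "/train/")
eval.write.mode("overwrite").parquet(dataRoot + "/eval/")
test.write.mode("overwrite").parquet(dataRoot + "/test/")
```
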
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage_gpu_crossvalidation.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage_gpu_crossvalidation.ipynb
index 74bb38f9d..d91c74dcf 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage_gpu_crossvalidation.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage_gpu_crossvalidation.ipynb
@@ -23,7 +23,7 @@
"import org.apache.spark.sql.SparkSession\n",
"import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\n",
"import org.apache.spark.ml.tuning.{ParamGridBuilder,CrossValidator}\n",
- "import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType}"
+ "import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType, DoubleType}"
]
},
{
@@ -42,36 +42,14 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "dataRoot = /data\n",
- "trainParquetPath = /data/mortgage/parquet/train\n",
- "evalParquetPath = /data/mortgage/parquet/eval\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "/data/mortgage/parquet/eval"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"// You need to update them to your real paths!\n",
- "val dataRoot = sys.env.getOrElse(\"DATA_ROOT\", \"/data\")\n",
- "val trainParquetPath=dataRoot + \"/mortgage/parquet/train\"\n",
- "val evalParquetPath=dataRoot + \"/mortgage/parquet/eval\""
+ "val dataRoot = sys.env.getOrElse(\"DATA_ROOT\", \"/mortgage\")\n",
+ "val trainParquetPath=dataRoot + \"/train\"\n",
+ "val evalParquetPath=dataRoot + \"/eval\""
]
},
{
@@ -83,30 +61,9 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "labelColName = delinquency_12\n",
- "schema = StructType(StructField(orig_channel,FloatType,true), StructField(first_home_buyer,FloatType,true), StructField(loan_purpose,FloatType,true), StructField(property_type,FloatType,true), StructField(occupancy_status,FloatType,true), StructField(property_state,FloatType,true), StructField(product_type,FloatType,true), StructField(relocation_mortgage_indicator,FloatType,true), StructField(seller_name,FloatType,true), StructField(mod_flag,FloatType,true), StructField(orig_interest_rate,FloatType,true), StructField(orig_upb,IntegerType,true), StructField(orig_loan_term,IntegerType,true), StructField(orig_ltv,FloatType,true), StructField(orig_cltv,FloatType,true), StructField(num_borrowers,FloatType,true), Str...\n"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "StructType(StructField(orig_channel,FloatType,true), StructField(first_home_buyer,FloatType,true), StructField(loan_purpose,FloatType,true), StructField(property_type,FloatType,true), StructField(occupancy_status,FloatType,true), StructField(property_state,FloatType,true), StructField(product_type,FloatType,true), StructField(relocation_mortgage_indicator,FloatType,true), StructField(seller_name,FloatType,true), StructField(mod_flag,FloatType,true), StructField(orig_interest_rate,FloatType,true), StructField(orig_upb,IntegerType,true), StructField(orig_loan_term,IntegerType,true), StructField(orig_ltv,FloatType,true), StructField(orig_cltv,FloatType,true), StructField(num_borrowers,FloatType,true), Str..."
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"val labelColName = \"delinquency_12\"\n",
"val schema = StructType(List(\n",
@@ -121,7 +78,7 @@
" StructField(\"seller_name\", FloatType),\n",
" StructField(\"mod_flag\", FloatType),\n",
" StructField(\"orig_interest_rate\", FloatType),\n",
- " StructField(\"orig_upb\", IntegerType),\n",
+ " StructField(\"orig_upb\", DoubleType),\n",
" StructField(\"orig_loan_term\", IntegerType),\n",
" StructField(\"orig_ltv\", FloatType),\n",
" StructField(\"orig_cltv\", FloatType),\n",
@@ -480,13 +437,6 @@
"source": [
"spark.close()"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -506,4 +456,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
\ No newline at end of file
+}
diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py
index 7782d84c5..1cca6e6d8 100644
--- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py
+++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/consts.py
@@ -30,7 +30,7 @@
StructField('seller_name', FloatType()),
StructField('mod_flag', FloatType()),
StructField('orig_interest_rate', FloatType()),
- StructField('orig_upb', IntegerType()),
+ StructField('orig_upb', DoubleType()),
StructField('orig_loan_term', IntegerType()),
StructField('orig_ltv', FloatType()),
StructField('orig_cltv', FloatType()),
@@ -133,67 +133,117 @@
'Other REFINANCE': 'OTHER REFINANCE',
}
-performance_schema = StructType([
- StructField('loan_id', LongType()),
- StructField('monthly_reporting_period', StringType()),
- StructField('servicer', StringType()),
- StructField('interest_rate', DoubleType()),
- StructField('current_actual_upb', DoubleType()),
- StructField('loan_age', DoubleType()),
- StructField('remaining_months_to_legal_maturity', DoubleType()),
- StructField('adj_remaining_months_to_maturity', DoubleType()),
- StructField('maturity_date', StringType()),
- StructField('msa', DoubleType()),
- StructField('current_loan_delinquency_status', IntegerType()),
- StructField('mod_flag', StringType()),
- StructField('zero_balance_code', StringType()),
- StructField('zero_balance_effective_date', StringType()),
- StructField('last_paid_installment_date', StringType()),
- StructField('foreclosed_after', StringType()),
- StructField('disposition_date', StringType()),
- StructField('foreclosure_costs', DoubleType()),
- StructField('prop_preservation_and_repair_costs', DoubleType()),
- StructField('asset_recovery_costs', DoubleType()),
- StructField('misc_holding_expenses', DoubleType()),
- StructField('holding_taxes', DoubleType()),
- StructField('net_sale_proceeds', DoubleType()),
- StructField('credit_enhancement_proceeds', DoubleType()),
- StructField('repurchase_make_whole_proceeds', StringType()),
- StructField('other_foreclosure_proceeds', DoubleType()),
- StructField('non_interest_bearing_upb', DoubleType()),
- StructField('principal_forgiveness_upb', StringType()),
- StructField('repurchase_make_whole_proceeds_flag', StringType()),
- StructField('foreclosure_principal_write_off_amount', StringType()),
- StructField('servicing_activity_indicator', StringType()),
-])
-acquisition_schema = StructType([
- StructField('loan_id', LongType()),
- StructField('orig_channel', StringType()),
- StructField('seller_name', StringType()),
- StructField('orig_interest_rate', DoubleType()),
- StructField('orig_upb', IntegerType()),
- StructField('orig_loan_term', IntegerType()),
- StructField('orig_date', StringType()),
- StructField('first_pay_date', StringType()),
- StructField('orig_ltv', DoubleType()),
- StructField('orig_cltv', DoubleType()),
- StructField('num_borrowers', DoubleType()),
- StructField('dti', DoubleType()),
- StructField('borrower_credit_score', DoubleType()),
- StructField('first_home_buyer', StringType()),
- StructField('loan_purpose', StringType()),
- StructField('property_type', StringType()),
- StructField('num_units', IntegerType()),
- StructField('occupancy_status', StringType()),
- StructField('property_state', StringType()),
- StructField('zip', IntegerType()),
- StructField('mortgage_insurance_percent', DoubleType()),
- StructField('product_type', StringType()),
- StructField('coborrow_credit_score', DoubleType()),
- StructField('mortgage_insurance_type', DoubleType()),
- StructField('relocation_mortgage_indicator', StringType()),
-])
+rawSchema = StructType([
+ StructField("reference_pool_id", StringType()),
+ StructField("loan_id", LongType()),
+ StructField("monthly_reporting_period", StringType()),
+ StructField("orig_channel", StringType()),
+ StructField("seller_name", StringType()),
+ StructField("servicer", StringType()),
+ StructField("master_servicer", StringType()),
+ StructField("orig_interest_rate", DoubleType()),
+ StructField("interest_rate", DoubleType()),
+ StructField("orig_upb", DoubleType()),
+ StructField("upb_at_issuance", StringType()),
+ StructField("current_actual_upb", DoubleType()),
+ StructField("orig_loan_term", IntegerType()),
+ StructField("orig_date", StringType()),
+ StructField("first_pay_date", StringType()),
+ StructField("loan_age", DoubleType()),
+ StructField("remaining_months_to_legal_maturity", DoubleType()),
+ StructField("adj_remaining_months_to_maturity", DoubleType()),
+ StructField("maturity_date", StringType()),
+ StructField("orig_ltv", DoubleType()),
+ StructField("orig_cltv", DoubleType()),
+ StructField("num_borrowers", DoubleType()),
+ StructField("dti", DoubleType()),
+ StructField("borrower_credit_score", DoubleType()),
+ StructField("coborrow_credit_score", DoubleType()),
+ StructField("first_home_buyer", StringType()),
+ StructField("loan_purpose", StringType()),
+ StructField("property_type", StringType()),
+ StructField("num_units", IntegerType()),
+ StructField("occupancy_status", StringType()),
+ StructField("property_state", StringType()),
+ StructField("msa", DoubleType()),
+ StructField("zip", IntegerType()),
+ StructField("mortgage_insurance_percent", DoubleType()),
+ StructField("product_type", StringType()),
+ StructField("prepayment_penalty_indicator", StringType()),
+ StructField("interest_only_loan_indicator", StringType()),
+ StructField("interest_only_first_principal_and_interest_payment_date", StringType()),
+ StructField("months_to_amortization", StringType()),
+ StructField("current_loan_delinquency_status", IntegerType()),
+ StructField("loan_payment_history", StringType()),
+ StructField("mod_flag", StringType()),
+ StructField("mortgage_insurance_cancellation_indicator", StringType()),
+ StructField("zero_balance_code", StringType()),
+ StructField("zero_balance_effective_date", StringType()),
+ StructField("upb_at_the_time_of_removal", StringType()),
+ StructField("repurchase_date", StringType()),
+ StructField("scheduled_principal_current", StringType()),
+ StructField("total_principal_current", StringType()),
+ StructField("unscheduled_principal_current", StringType()),
+ StructField("last_paid_installment_date", StringType()),
+ StructField("foreclosed_after", StringType()),
+ StructField("disposition_date", StringType()),
+ StructField("foreclosure_costs", DoubleType()),
+ StructField("prop_preservation_and_repair_costs", DoubleType()),
+ StructField("asset_recovery_costs", DoubleType()),
+ StructField("misc_holding_expenses", DoubleType()),
+ StructField("holding_taxes", DoubleType()),
+ StructField("net_sale_proceeds", DoubleType()),
+ StructField("credit_enhancement_proceeds", DoubleType()),
+ StructField("repurchase_make_whole_proceeds", StringType()),
+ StructField("other_foreclosure_proceeds", DoubleType()),
+ StructField("non_interest_bearing_upb", DoubleType()),
+ StructField("principal_forgiveness_upb", StringType()),
+ StructField("original_list_start_date", StringType()),
+ StructField("original_list_price", StringType()),
+ StructField("current_list_start_date", StringType()),
+ StructField("current_list_price", StringType()),
+ StructField("borrower_credit_score_at_issuance", StringType()),
+ StructField("co-borrower_credit_score_at_issuance", StringType()),
+ StructField("borrower_credit_score_current", StringType()),
+ StructField("co-Borrower_credit_score_current", StringType()),
+ StructField("mortgage_insurance_type", DoubleType()),
+ StructField("servicing_activity_indicator", StringType()),
+ StructField("current_period_modification_loss_amount", StringType()),
+ StructField("cumulative_modification_loss_amount", StringType()),
+ StructField("current_period_credit_event_net_gain_or_loss", StringType()),
+ StructField("cumulative_credit_event_net_gain_or_loss", StringType()),
+ StructField("homeready_program_indicator", StringType()),
+ StructField("foreclosure_principal_write_off_amount", StringType()),
+ StructField("relocation_mortgage_indicator", StringType()),
+ StructField("zero_balance_code_change_date", StringType()),
+ StructField("loan_holdback_indicator", StringType()),
+ StructField("loan_holdback_effective_date", StringType()),
+ StructField("delinquent_accrued_interest", StringType()),
+ StructField("property_valuation_method", StringType()),
+ StructField("high_balance_loan_indicator", StringType()),
+ StructField("arm_initial_fixed-rate_period_lt_5_yr_indicator", StringType()),
+ StructField("arm_product_type", StringType()),
+ StructField("initial_fixed-rate_period", StringType()),
+ StructField("interest_rate_adjustment_frequency", StringType()),
+ StructField("next_interest_rate_adjustment_date", StringType()),
+ StructField("next_payment_change_date", StringType()),
+ StructField("index", StringType()),
+ StructField("arm_cap_structure", StringType()),
+ StructField("initial_interest_rate_cap_up_percent", StringType()),
+ StructField("periodic_interest_rate_cap_up_percent", StringType()),
+ StructField("lifetime_interest_rate_cap_up_percent", StringType()),
+ StructField("mortgage_margin", StringType()),
+ StructField("arm_balloon_indicator", StringType()),
+ StructField("arm_plan_number", StringType()),
+ StructField("borrower_assistance_plan", StringType()),
+ StructField("hltv_refinance_option_indicator", StringType()),
+ StructField("deal_name", StringType()),
+ StructField("repurchase_make_whole_proceeds_flag", StringType()),
+ StructField("alternative_delinquency_resolution", StringType()),
+ StructField("alternative_delinquency_resolution_count", StringType()),
+ StructField("total_deferral_amount", StringType())
+ ])
categorical_columns = [
'orig_channel',
diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py
index eb3f40aef..d1f157833 100644
--- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py
+++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl.py
@@ -16,9 +16,10 @@
from com.nvidia.spark.examples.mortgage.consts import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
+from pyspark.sql.window import Window
from sys import exit
-get_quarter = udf(lambda path: path.split(r'.')[0].split('_')[-1], StringType())
+get_quarter = udf(lambda path: path.split(r'.')[0].split('/')[-1], StringType())
standardize_name = udf(lambda name: name_mapping.get(name), StringType())
def load_data(spark, paths, schema, args, extra_csv_opts={}):
@@ -31,18 +32,60 @@ def load_data(spark, paths, schema, args, extra_csv_opts={}):
(reader
.schema(schema)
.option('delimiter', '|')
- .option('header', args.hasHeader))
+ .option('header', False))
for k, v in extra_csv_opts.items():
reader.option(k, v)
return reader.load(paths)
-def prepare_performance(spark, args):
+def prepare_rawDf(spark, args):
extra_csv_options = {
'nullValue': '',
'parserLib': 'univocity',
}
- paths = extract_paths(args.dataPaths, 'perf::')
- performance = (load_data(spark, paths, performance_schema, args, extra_csv_options)
+ paths = extract_paths(args.dataPaths, 'data::')
+ rawDf = load_data(spark, paths, rawSchema, args, extra_csv_options)
+
+ return rawDf
+
+def extract_perf_columns(rawDf):
+ perfDf = rawDf.select(
+ col("loan_id"),
+ date_format(to_date(col("monthly_reporting_period"),"MMyyyy"), "MM/dd/yyyy").alias("monthly_reporting_period"),
+ upper(col("servicer")).alias("servicer"),
+ col("interest_rate"),
+ col("current_actual_upb"),
+ col("loan_age"),
+ col("remaining_months_to_legal_maturity"),
+ col("adj_remaining_months_to_maturity"),
+ date_format(to_date(col("maturity_date"),"MMyyyy"), "MM/yyyy").alias("maturity_date"),
+ col("msa"),
+ col("current_loan_delinquency_status"),
+ col("mod_flag"),
+ col("zero_balance_code"),
+ date_format(to_date(col("zero_balance_effective_date"),"MMyyyy"), "MM/yyyy").alias("zero_balance_effective_date"),
+ date_format(to_date(col("last_paid_installment_date"),"MMyyyy"), "MM/dd/yyyy").alias("last_paid_installment_date"),
+ date_format(to_date(col("foreclosed_after"),"MMyyyy"), "MM/dd/yyyy").alias("foreclosed_after"),
+ date_format(to_date(col("disposition_date"),"MMyyyy"), "MM/dd/yyyy").alias("disposition_date"),
+ col("foreclosure_costs"),
+ col("prop_preservation_and_repair_costs"),
+ col("asset_recovery_costs"),
+ col("misc_holding_expenses"),
+ col("holding_taxes"),
+ col("net_sale_proceeds"),
+ col("credit_enhancement_proceeds"),
+ col("repurchase_make_whole_proceeds"),
+ col("other_foreclosure_proceeds"),
+ col("non_interest_bearing_upb"),
+ col("principal_forgiveness_upb"),
+ col("repurchase_make_whole_proceeds_flag"),
+ col("foreclosure_principal_write_off_amount"),
+ col("servicing_activity_indicator"))
+
+ return perfDf.select("*").filter("current_actual_upb != 0.0")
+
+
+def prepare_performance(spark, args, rawDf):
+ performance = (extract_perf_columns(rawDf)
.withColumn('quarter', get_quarter(input_file_name()))
.withColumn('timestamp', to_date(col('monthly_reporting_period'), 'MM/dd/yyyy'))
.withColumn('timestamp_year', year(col('timestamp')))
@@ -133,8 +176,42 @@ def prepare_performance(spark, args):
.join(to_join, ['quarter', 'loan_id', 'timestamp_year', 'timestamp_month'], 'left')
.drop('timestamp_year', 'timestamp_month'))
-def prepare_acquisition(spark, args):
- return (load_data(spark, extract_paths(args.dataPaths, 'acq::'), acquisition_schema, args)
+def extract_acq_columns(rawDf):
+ acqDf = rawDf.select(
+ col("loan_id"),
+ col("orig_channel"),
+ upper(col("seller_name")).alias("seller_name"),
+ col("orig_interest_rate"),
+ col("orig_upb"),
+ col("orig_loan_term"),
+ date_format(to_date(col("orig_date"),"MMyyyy"), "MM/yyyy").alias("orig_date"),
+ date_format(to_date(col("first_pay_date"),"MMyyyy"), "MM/yyyy").alias("first_pay_date"),
+ col("orig_ltv"),
+ col("orig_cltv"),
+ col("num_borrowers"),
+ col("dti"),
+ col("borrower_credit_score"),
+ col("first_home_buyer"),
+ col("loan_purpose"),
+ col("property_type"),
+ col("num_units"),
+ col("occupancy_status"),
+ col("property_state"),
+ col("zip"),
+ col("mortgage_insurance_percent"),
+ col("product_type"),
+ col("coborrow_credit_score"),
+ col("mortgage_insurance_type"),
+ col("relocation_mortgage_indicator"),
+ dense_rank().over(Window.partitionBy("loan_id").orderBy(to_date(col("monthly_reporting_period"),"MMyyyy"))).alias("rank")
+ )
+
+ return acqDf.select("*").filter(col("rank")==1)
+
+
+
+def prepare_acquisition(spark, args, rawDf):
+ return (extract_acq_columns(rawDf)
.withColumn('quarter', get_quarter(input_file_name()))
.withColumn('seller_name', standardize_name(col('seller_name'))))
@@ -147,8 +224,9 @@ def extract_paths(paths, prefix):
return results
def etl(spark, args):
- performance = prepare_performance(spark, args)
- acquisition = prepare_acquisition(spark, args)
+ rawDf = prepare_rawDf(spark, args)
+ performance = prepare_performance(spark, args, rawDf)
+ acquisition = prepare_acquisition(spark, args, rawDf)
return (performance
.join(acquisition, ['loan_id', 'quarter'], 'left_outer')
.select(
diff --git a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py
index 6002f5056..55f5df5fc 100644
--- a/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py
+++ b/examples/XGBoost-Examples/mortgage/python/com/nvidia/spark/examples/mortgage/etl_main.py
@@ -16,7 +16,6 @@
from com.nvidia.spark.examples.mortgage.consts import *
from com.nvidia.spark.examples.mortgage.etl import etl, extract_paths
from com.nvidia.spark.examples.utility.utils import *
-from ml.dmlc.xgboost4j.scala.spark import *
from pyspark.sql import SparkSession
def main(args, xgboost_args):
diff --git a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/ETLMain.scala b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/ETLMain.scala
index f54d3d67c..0b798739b 100644
--- a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/ETLMain.scala
+++ b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/ETLMain.scala
@@ -31,17 +31,17 @@ object ETLMain extends Mortgage {
val spark = SparkSession.builder().appName(appInfo.mkString("-")).getOrCreate()
try {
- val (perfPaths, acqPaths, outPath) = checkAndGetPaths(xgbArgs.dataPaths)
+ val (dataPaths, outPath) = checkAndGetPaths(xgbArgs.dataPaths)
println("\n------ Start ETL ------")
benchmark.time("ETL") {
// ETL the raw data
val rawDF = xgbArgs.format match {
- case "csv" => XGBoostETL.csv(spark, perfPaths, acqPaths, xgbArgs.hasHeader)
- case "orc" => XGBoostETL.orc(spark, perfPaths, acqPaths)
- case "parquet" => XGBoostETL.parquet(spark, perfPaths, acqPaths)
+ case "csv" => XGBoostETL.csv(spark, dataPaths, false)
+ case "orc" => XGBoostETL.orc(spark, dataPaths)
+ case "parquet" => XGBoostETL.parquet(spark, dataPaths)
case _ => throw new IllegalArgumentException("Unsupported data file format!")
}
- rawDF.write.mode("overwrite").parquet(new Path(outPath, "data").toString)
+ rawDF.write.mode("overwrite").parquet(outPath)
}
if (xgbArgs.saveDict) {
XGBoostETL.saveDictTable(new Path(outPath, ".dict").toString)
@@ -52,32 +52,26 @@ object ETLMain extends Mortgage {
}
}
- private def checkAndGetPaths(paths: Seq[String]): (Seq[String], Seq[String], String) = {
- val prefixes = Array("perf::", "acq::", "out::")
+ private def checkAndGetPaths(paths: Seq[String]): (Seq[String], String) = {
+ val prefixes = Array("data::", "out::")
val validPaths = paths.filter(_.nonEmpty).map(_.trim)
-    // get and check perf data paths
+    // get and check data paths
- val perfPaths = validPaths.filter(_.startsWith(prefixes.head))
- require(perfPaths.nonEmpty, s"$appName ETL requires at least one path for performance data file." +
- s" Please specify it by '-dataPath=perf::your_perf_path'")
-
- // get and check acq data paths
- val acqPaths = validPaths.filter(_.startsWith(prefixes(1)))
- require(acqPaths.nonEmpty, s"$appName ETL requires at least one path for acquisition data file." +
- s" Please specify it by '-dataPath=acq::your_acq_path'")
+ val dataPaths = validPaths.filter(_.startsWith(prefixes.head))
+    require(dataPaths.nonEmpty, s"$appName ETL requires at least one path for the data files." +
+ s" Please specify it by '-dataPath=data::your_data_path'")
// get and check out path
- val outPath = validPaths.filter(_.startsWith(prefixes(2)))
+ val outPath = validPaths.filter(_.startsWith(prefixes(1)))
require(outPath.nonEmpty, s"$appName ETL requires a path to save the ETLed data file. Please specify it" +
" by '-dataPath=out::your_out_path', only the first path is used if multiple paths are found.")
// check data paths not specified type
val unknownPaths = validPaths.filterNot(p => prefixes.exists(p.contains(_)))
require(unknownPaths.isEmpty, s"Unknown type for data path: ${unknownPaths.head}, $appName requires to specify" +
- " the type for each data path by adding the prefix 'perf::' or 'acq::' or 'out::'.")
+ " the type for each data path by adding the prefix 'data::' or 'out::'.")
- (perfPaths.map(_.stripPrefix(prefixes.head)),
- acqPaths.map(_.stripPrefix(prefixes(1))),
- outPath.head.stripPrefix(prefixes(2)))
+ (dataPaths.map(_.stripPrefix(prefixes.head)),
+ outPath.head.stripPrefix(prefixes(1)))
}
}
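
With this change the standalone ETL entry point takes a single combined raw data path prefixed with `data::` plus an output path prefixed with `out::`, instead of separate `perf::`/`acq::` inputs, and reads the csv files as headerless. A minimal sketch of the equivalent programmatic call; the paths are hypothetical.

```scala
import org.apache.spark.sql.SparkSession
import com.nvidia.spark.examples.mortgage.XGBoostETL

// Roughly what ETLMain now does for:
//   -dataPath=data::/data/mortgage/input/ -dataPath=out::/data/mortgage/output/
// The raw Single-Family Loan Performance csv files are pipe-delimited with no header,
// hence hasHeader = false.
val spark = SparkSession.builder.appName("mortgage-etl-sketch").getOrCreate()
val rawDF = XGBoostETL.csv(spark, Seq("/data/mortgage/input/"), hasHeader = false)
rawDF.write.mode("overwrite").parquet("/data/mortgage/output")
```
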
diff --git a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala
index 582492006..c051cff07 100644
--- a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala
+++ b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala
@@ -16,7 +16,7 @@
package com.nvidia.spark.examples.mortgage
-import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType}
+import org.apache.spark.sql.types.{FloatType, IntegerType, StructField, StructType, DoubleType}
private[mortgage] trait Mortgage {
val appName = "Mortgage"
@@ -37,7 +37,7 @@ private[mortgage] trait Mortgage {
protected val numericCols = List(
("orig_interest_rate", FloatType),
- ("orig_upb", IntegerType),
+ ("orig_upb", DoubleType),
("orig_loan_term", IntegerType),
("orig_ltv", FloatType),
("orig_cltv", FloatType),
diff --git a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/XGBoostETL.scala b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/XGBoostETL.scala
index 0ef25ea2e..55f40799d 100644
--- a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/XGBoostETL.scala
+++ b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/XGBoostETL.scala
@@ -27,27 +27,64 @@ object GetQuarterFromCsvFileName {
-  // So we strip off the .txt and everything after it
-  // and then take everything after the last remaining _
+  // So we strip off the file extension and everything after it
+  // and then take everything after the last remaining /
def apply(): Column = substring_index(
- substring_index(input_file_name(), ".", 1), "_", -1)
+ substring_index(input_file_name(), ".", 1), "/", -1)
}
private object CsvReader {
- def readPerformance(spark: SparkSession, paths: Seq[String], optionsMap: Map[String, String]): DataFrame = {
- val performanceSchema = StructType(Array(
+ def readRaw(spark: SparkSession, paths: Seq[String], optionsMap: Map[String, String]): DataFrame = {
+
+ val rawSchema = StructType(Array(
+ StructField("reference_pool_id", StringType),
StructField("loan_id", LongType),
StructField("monthly_reporting_period", StringType),
+ StructField("orig_channel", StringType),
+ StructField("seller_name", StringType),
StructField("servicer", StringType),
+ StructField("master_servicer", StringType),
+ StructField("orig_interest_rate", DoubleType),
StructField("interest_rate", DoubleType),
+ StructField("orig_upb", DoubleType),
+ StructField("upb_at_issuance", StringType),
StructField("current_actual_upb", DoubleType),
+ StructField("orig_loan_term", IntegerType),
+ StructField("orig_date", StringType),
+ StructField("first_pay_date", StringType),
StructField("loan_age", DoubleType),
StructField("remaining_months_to_legal_maturity", DoubleType),
StructField("adj_remaining_months_to_maturity", DoubleType),
StructField("maturity_date", StringType),
+ StructField("orig_ltv", DoubleType),
+ StructField("orig_cltv", DoubleType),
+ StructField("num_borrowers", DoubleType),
+ StructField("dti", DoubleType),
+ StructField("borrower_credit_score", DoubleType),
+ StructField("coborrow_credit_score", DoubleType),
+ StructField("first_home_buyer", StringType),
+ StructField("loan_purpose", StringType),
+ StructField("property_type", StringType),
+ StructField("num_units", IntegerType),
+ StructField("occupancy_status", StringType),
+ StructField("property_state", StringType),
StructField("msa", DoubleType),
+ StructField("zip", IntegerType),
+ StructField("mortgage_insurance_percent", DoubleType),
+ StructField("product_type", StringType),
+ StructField("prepayment_penalty_indicator", StringType),
+ StructField("interest_only_loan_indicator", StringType),
+ StructField("interest_only_first_principal_and_interest_payment_date", StringType),
+ StructField("months_to_amortization", StringType),
StructField("current_loan_delinquency_status", IntegerType),
+ StructField("loan_payment_history", StringType),
StructField("mod_flag", StringType),
+ StructField("mortgage_insurance_cancellation_indicator", StringType),
StructField("zero_balance_code", StringType),
StructField("zero_balance_effective_date", StringType),
+ StructField("upb_at_the_time_of_removal", StringType),
+ StructField("repurchase_date", StringType),
+ StructField("scheduled_principal_current", StringType),
+ StructField("total_principal_current", StringType),
+ StructField("unscheduled_principal_current", StringType),
StructField("last_paid_installment_date", StringType),
StructField("foreclosed_after", StringType),
StructField("disposition_date", StringType),
@@ -62,59 +99,141 @@ private object CsvReader {
StructField("other_foreclosure_proceeds", DoubleType),
StructField("non_interest_bearing_upb", DoubleType),
StructField("principal_forgiveness_upb", StringType),
- StructField("repurchase_make_whole_proceeds_flag", StringType),
+ StructField("original_list_start_date", StringType),
+ StructField("original_list_price", StringType),
+ StructField("current_list_start_date", StringType),
+ StructField("current_list_price", StringType),
+ StructField("borrower_credit_score_at_issuance", StringType),
+ StructField("co-borrower_credit_score_at_issuance", StringType),
+ StructField("borrower_credit_score_current", StringType),
+ StructField("co-Borrower_credit_score_current", StringType),
+ StructField("mortgage_insurance_type", DoubleType),
+ StructField("servicing_activity_indicator", StringType),
+ StructField("current_period_modification_loss_amount", StringType),
+ StructField("cumulative_modification_loss_amount", StringType),
+ StructField("current_period_credit_event_net_gain_or_loss", StringType),
+ StructField("cumulative_credit_event_net_gain_or_loss", StringType),
+ StructField("homeready_program_indicator", StringType),
StructField("foreclosure_principal_write_off_amount", StringType),
- StructField("servicing_activity_indicator", StringType))
+ StructField("relocation_mortgage_indicator", StringType),
+ StructField("zero_balance_code_change_date", StringType),
+ StructField("loan_holdback_indicator", StringType),
+ StructField("loan_holdback_effective_date", StringType),
+ StructField("delinquent_accrued_interest", StringType),
+ StructField("property_valuation_method", StringType),
+ StructField("high_balance_loan_indicator", StringType),
+ StructField("arm_initial_fixed-rate_period_lt_5_yr_indicator", StringType),
+ StructField("arm_product_type", StringType),
+ StructField("initial_fixed-rate_period", StringType),
+ StructField("interest_rate_adjustment_frequency", StringType),
+ StructField("next_interest_rate_adjustment_date", StringType),
+ StructField("next_payment_change_date", StringType),
+ StructField("index", StringType),
+ StructField("arm_cap_structure", StringType),
+ StructField("initial_interest_rate_cap_up_percent", StringType),
+ StructField("periodic_interest_rate_cap_up_percent", StringType),
+ StructField("lifetime_interest_rate_cap_up_percent", StringType),
+ StructField("mortgage_margin", StringType),
+ StructField("arm_balloon_indicator", StringType),
+ StructField("arm_plan_number", StringType),
+ StructField("borrower_assistance_plan", StringType),
+ StructField("hltv_refinance_option_indicator", StringType),
+ StructField("deal_name", StringType),
+ StructField("repurchase_make_whole_proceeds_flag", StringType),
+ StructField("alternative_delinquency_resolution", StringType),
+ StructField("alternative_delinquency_resolution_count", StringType),
+ StructField("total_deferral_amount", StringType)
+ )
)
spark.read
.options(optionsMap)
.option("nullValue", "")
.option("delimiter", "|")
- .option("parserLib", "univocity")
- .schema(performanceSchema)
+ .schema(rawSchema)
.csv(paths: _*)
.withColumn("quarter", GetQuarterFromCsvFileName())
}
+}
- def readAcquisition(spark: SparkSession, paths: Seq[String], optionsMap: Map[String, String]): DataFrame = {
- val acquisitionSchema = StructType(Array(
- StructField("loan_id", LongType),
- StructField("orig_channel", StringType),
- StructField("seller_name", StringType),
- StructField("orig_interest_rate", DoubleType),
- StructField("orig_upb", IntegerType),
- StructField("orig_loan_term", IntegerType),
- StructField("orig_date", StringType),
- StructField("first_pay_date", StringType),
- StructField("orig_ltv", DoubleType),
- StructField("orig_cltv", DoubleType),
- StructField("num_borrowers", DoubleType),
- StructField("dti", DoubleType),
- StructField("borrower_credit_score", DoubleType),
- StructField("first_home_buyer", StringType),
- StructField("loan_purpose", StringType),
- StructField("property_type", StringType),
- StructField("num_units", IntegerType),
- StructField("occupancy_status", StringType),
- StructField("property_state", StringType),
- StructField("zip", IntegerType),
- StructField("mortgage_insurance_percent", DoubleType),
- StructField("product_type", StringType),
- StructField("coborrow_credit_score", DoubleType),
- StructField("mortgage_insurance_type", DoubleType),
- StructField("relocation_mortgage_indicator", StringType))
+object extractPerfColumns{
+ def apply(rawDf : DataFrame) : DataFrame = {
+ val perfDf = rawDf.select(
+ col("loan_id"),
+ date_format(to_date(col("monthly_reporting_period"),"MMyyyy"), "MM/dd/yyyy").as("monthly_reporting_period"),
+ upper(col("servicer")).as("servicer"),
+ col("interest_rate"),
+ col("current_actual_upb"),
+ col("loan_age"),
+ col("remaining_months_to_legal_maturity"),
+ col("adj_remaining_months_to_maturity"),
+ date_format(to_date(col("maturity_date"),"MMyyyy"), "MM/yyyy").as("maturity_date"),
+ col("msa"),
+ col("current_loan_delinquency_status"),
+ col("mod_flag"),
+ col("zero_balance_code"),
+ date_format(to_date(col("zero_balance_effective_date"),"MMyyyy"), "MM/yyyy").as("zero_balance_effective_date"),
+ date_format(to_date(col("last_paid_installment_date"),"MMyyyy"), "MM/dd/yyyy").as("last_paid_installment_date"),
+ date_format(to_date(col("foreclosed_after"),"MMyyyy"), "MM/dd/yyyy").as("foreclosed_after"),
+ date_format(to_date(col("disposition_date"),"MMyyyy"), "MM/dd/yyyy").as("disposition_date"),
+ col("foreclosure_costs"),
+ col("prop_preservation_and_repair_costs"),
+ col("asset_recovery_costs"),
+ col("misc_holding_expenses"),
+ col("holding_taxes"),
+ col("net_sale_proceeds"),
+ col("credit_enhancement_proceeds"),
+ col("repurchase_make_whole_proceeds"),
+ col("other_foreclosure_proceeds"),
+ col("non_interest_bearing_upb"),
+ col("principal_forgiveness_upb"),
+ col("repurchase_make_whole_proceeds_flag"),
+ col("foreclosure_principal_write_off_amount"),
+ col("servicing_activity_indicator"),
+ col("quarter")
)
- spark.read
- .options(optionsMap)
- .option("delimiter", "|")
- .schema(acquisitionSchema)
- .csv(paths: _*)
- .withColumn("quarter", GetQuarterFromCsvFileName())
+ perfDf.select("*").filter("current_actual_upb != 0.0")
}
}
+object extractAcqColumns{
+ def apply(rawDf : DataFrame) : DataFrame = {
+ val acqDf = rawDf.select(
+ col("loan_id"),
+ col("orig_channel"),
+ upper(col("seller_name")).as("seller_name"),
+ col("orig_interest_rate"),
+ col("orig_upb"),
+ col("orig_loan_term"),
+ date_format(to_date(col("orig_date"),"MMyyyy"), "MM/yyyy").as("orig_date"),
+ date_format(to_date(col("first_pay_date"),"MMyyyy"), "MM/yyyy").as("first_pay_date"),
+ col("orig_ltv"),
+ col("orig_cltv"),
+ col("num_borrowers"),
+ col("dti"),
+ col("borrower_credit_score"),
+ col("first_home_buyer"),
+ col("loan_purpose"),
+ col("property_type"),
+ col("num_units"),
+ col("occupancy_status"),
+ col("property_state"),
+ col("zip"),
+ col("mortgage_insurance_percent"),
+ col("product_type"),
+ col("coborrow_credit_score"),
+ col("mortgage_insurance_type"),
+ col("relocation_mortgage_indicator"),
+ col("quarter"),
+ dense_rank().over(Window.partitionBy("loan_id").orderBy(to_date(col("monthly_reporting_period"),"MMyyyy"))).as("rank")
+ )
+
+ acqDf.select("*").filter(col("rank") === 1).drop("rank")
+ }
+
+}
+
object NameMapping {
/**
* Returns a dataframe with two columns named based off of the column names passed in.
@@ -414,27 +533,36 @@ object XGBoostETL extends Mortgage {
}
}
- def csv(spark: SparkSession, perfPaths: Seq[String], acqPaths: Seq[String], hasHeader: Boolean): DataFrame = {
+ def csv(spark: SparkSession, dataPaths: Seq[String], hasHeader: Boolean): DataFrame = {
val optionsMap = Map("header" -> hasHeader.toString)
+ val rawDf = CsvReader.readRaw(spark, dataPaths, optionsMap)
+ val perfDf = extractPerfColumns(rawDf)
+ val acqDf = extractAcqColumns(rawDf)
transform(
- CsvReader.readPerformance(spark, perfPaths, optionsMap),
- CsvReader.readAcquisition(spark, acqPaths, optionsMap),
+ perfDf,
+ acqDf,
spark
)
}
- def parquet(spark: SparkSession, perfPaths: Seq[String], acqPaths: Seq[String]): DataFrame = {
+ def parquet(spark: SparkSession, dataPaths: Seq[String]): DataFrame = {
+ val rawDf = spark.read.parquet(dataPaths: _*)
+ val perfDf = extractPerfColumns(rawDf)
+ val acqDf = extractAcqColumns(rawDf)
transform(
- spark.read.parquet(perfPaths: _*),
- spark.read.parquet(acqPaths: _*),
+ perfDf,
+ acqDf,
spark
)
}
- def orc(spark: SparkSession, perfPaths: Seq[String], acqPaths: Seq[String]): DataFrame = {
+ def orc(spark: SparkSession, dataPaths: Seq[String]): DataFrame = {
+ val rawDf = spark.read.orc(dataPaths: _*)
+ val perfDf = extractPerfColumns(rawDf)
+ val acqDf = extractAcqColumns(rawDf)
transform(
- spark.read.orc(perfPaths: _*),
- spark.read.orc(acqPaths: _*),
+ perfDf,
+ acqDf,
spark
)
}