From cced8926117e11f165bf2dbc3de46c74be05f2b6 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 15:46:57 -0500
Subject: [PATCH 1/3] add driver_host to spark config

---
 docker-compose.yaml   |  3 ++-
 scripts/entrypoint.sh | 10 ++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/docker-compose.yaml b/docker-compose.yaml
index 9e682c8..3dd4717 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -62,4 +62,5 @@ services:
       - spark-master
     environment:
       - NOTEBOOK_PORT=4041
-      - SPARK_MASTER_URL=spark://spark-master:7077
\ No newline at end of file
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_DRIVER_HOST=spark-notebook
\ No newline at end of file
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
index 3a70178..277aeba 100644
--- a/scripts/entrypoint.sh
+++ b/scripts/entrypoint.sh
@@ -2,6 +2,16 @@
 
 echo "starting jupyter notebook"
 
+if [ -n "$SPARK_DRIVER_HOST" ]; then
+  echo "Setting spark.driver.host to $SPARK_DRIVER_HOST"
+  source /opt/bitnami/scripts/spark-env.sh
+  if [ -z "$SPARK_CONF_FILE" ]; then
+    echo "Error: unable to find SPARK_CONF_FILE path"
+    exit 1
+  fi
+  echo "spark.driver.host $SPARK_DRIVER_HOST" >> $SPARK_CONF_FILE
+fi
+
 WORKSPACE_DIR="/cdm_shared_workspace"
 mkdir -p "$WORKSPACE_DIR"
 cd "$WORKSPACE_DIR"

From 10b9338e60b668fba24a759e56d31baf0096b842 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 16:20:22 -0500
Subject: [PATCH 2/3] update readme

---
 README.md | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 1876d59..181134f 100644
--- a/README.md
+++ b/README.md
@@ -59,29 +59,26 @@ sc.stop()
 
 ### Spark Session/Context Configuration
 
-Ensure to configure `spark.driver.host` for the Spark driver to bind to the Jupyter notebook container's hostname
+When running Spark in the Jupyter notebook container, the `spark.driver.host` configuration is automatically set to
+the hostname (`SPARK_DRIVER_HOST`) of the container.
 
+#### Example SparkSession Configuration
 ```python
 spark = SparkSession.builder \
-    .master(os.environ['SPARK_MASTER_URL']) \
     .appName("TestSparkJob") \
-    .config("spark.driver.host", os.environ['SPARK_DRIVER_HOST']) \
     .getOrCreate()
 ```
-Or
+
+#### Example SparkContext Configuration
 ```python
 conf = SparkConf(). \
-    setMaster( os.environ['SPARK_MASTER_URL']). \
-    setAppName("TestSparkJob"). \
-    set("spark.driver.host", os.environ['SPARK_DRIVER_HOST'])
+    setAppName("TestSparkJob")
 sc = SparkContext(conf=conf)
 ```
 
-Submitting job using terminal
+#### Submitting a Job Using Terminal
 ```bash
 /opt/bitnami/spark/bin/spark-submit \
-  --master $SPARK_MASTER_URL \
-  --conf spark.driver.host=$SPARK_DRIVER_HOST \
   /opt/bitnami/spark/examples/src/main/python/pi.py 10 \
   2>/dev/null
 ```

From efb94e4b3bb59a81c53101d0b59d50cead5c2c56 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 16:31:20 -0500
Subject: [PATCH 3/3] update readme

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 181134f..af10570 100644
--- a/README.md
+++ b/README.md
@@ -59,8 +59,9 @@ sc.stop()
 
 ### Spark Session/Context Configuration
 
-When running Spark in the Jupyter notebook container, the `spark.driver.host` configuration is automatically set to
-the hostname (`SPARK_DRIVER_HOST`) of the container.
+When running Spark in the Jupyter notebook container, the default `spark.driver.host` configuration is set to
+the hostname (`SPARK_DRIVER_HOST`) of the container.
+In addition, the environment variable `SPARK_MASTER_URL` should also be configured.
 
 #### Example SparkSession Configuration
 ```python
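
For illustration, here is a minimal sketch (not part of the patch series) of how a notebook session consumes this configuration once the patched `entrypoint.sh` has run. It assumes `SPARK_CONF_FILE` resolves to the Bitnami image's Spark defaults file, so the appended line reads something like `spark.driver.host spark-notebook`; the app name and the printed check below are hypothetical.

```python
# Minimal sketch, assuming entrypoint.sh has already appended
# "spark.driver.host $SPARK_DRIVER_HOST" to the file named by SPARK_CONF_FILE,
# so the builder does not need to set spark.driver.host itself.
import os
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master(os.environ["SPARK_MASTER_URL"])  # spark://spark-master:7077 in docker-compose.yaml
    .appName("DriverHostCheck")              # hypothetical app name
    .getOrCreate()
)

# The driver should now advertise the notebook container's hostname
# (SPARK_DRIVER_HOST, i.e. "spark-notebook" in docker-compose.yaml) so that
# executors on the Spark workers can connect back to it.
print(spark.sparkContext.getConf().get("spark.driver.host", "unset"))
spark.stop()
```

This is also why the README examples in PATCH 2/3 drop the explicit `.config("spark.driver.host", ...)` and `--conf spark.driver.host=...` settings: the value now comes from the Spark defaults file written at container startup.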