From 1e98bb46a2b3d8cb71d087a559a210d4ea14dcfa Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Wed, 15 May 2024 19:49:51 -0500
Subject: [PATCH 01/10] add jupyter notebook

---
 Dockerfile            | 20 +++++++++++++-
 README.md             | 52 +++++++++++++++++++++++++++++++++++-
 docker-compose.yaml   | 61 +++++++++++++++++++++++++++++++++++++++++++
 scripts/entrypoint.sh | 16 ++++++++++++
 4 files changed, 147 insertions(+), 2 deletions(-)
 create mode 100644 docker-compose.yaml
 create mode 100644 scripts/entrypoint.sh

diff --git a/Dockerfile b/Dockerfile
index 8233426..4dd24e4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,22 @@
 FROM bitnami/spark:3.5.1
 
+RUN export ORI_USER=$(id -u)
+# Switch to root to install packages
+USER root
 
-ENTRYPOINT ["sleep 10s"]
+# Install necessary packages
+RUN apt-get update && apt-get install -y \
+    # Fixing error: psutil could not be installed from sources because gcc is not installed
+    gcc \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Jupyterlab and other python dependencies
+RUN pip install jupyterlab==4.2.0 pyspark==3.5.1
+
+COPY scripts/entrypoint.sh /opt/
+RUN chmod a+x /opt/entrypoint.sh
+
+# Switch back to the original user
+USER ${ORI_USER}
+
+ENTRYPOINT ["/opt/entrypoint.sh"]
diff --git a/README.md b/README.md
index ce88440..cd3a632 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,71 @@
 # CDM Jupyterhub dockerfiles (Prototype)
 
 This prototype establishes a Docker container configuration for JupyterHub, designed to furnish a multi-user
-environment tailored for executing Spark jobs via Jupyter notebooks.
\ No newline at end of file
+environment tailored for executing Spark jobs via Jupyter notebooks.
+
+## Using `docker-compose.yaml`
+
+To deploy the JupyterHub container and Spark nodes locally, execute the following command:
+
+```bash
+docker-compose up --build
+```
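+
+To stop the containers and tear the local deployment down again, run `docker-compose down`.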
+
+## Test Submitting a Spark Job Locally
+
+### Submitting a Spark Job via spark-test-node
+```bash
+docker exec -it spark-test-node \
+    sh -c '
+    /opt/bitnami/spark/bin/spark-submit \
+    --master spark://spark-master:7077 \
+    examples/src/main/python/pi.py 10 \
+    2>/dev/null
+    '
+```
+
+### Submitting a Spark Job via Jupyter Notebook
+After launching the [Jupyter Notebook](http://localhost:4041/), establish a Spark context or session with the Spark
+master set to the value of the environment variable `SPARK_MASTER_URL`, then submit your job. Once the job is
+submitted, you can monitor its status and logs in the [Spark UI](http://localhost:8080/).
+
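+A session can equally be established with the `SparkSession` builder (a minimal sketch; the app name is arbitrary,
+and only the `SPARK_MASTER_URL` environment variable from `docker-compose.yaml` is assumed):
+
+```python
+import os
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder \
+    .master(os.environ['SPARK_MASTER_URL']) \
+    .appName("MyApp") \
+    .getOrCreate()
+
+# The underlying SparkContext is available as spark.sparkContext
+```
+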
+Sample code to calculate Pi using `SparkContext`:
+```python
+from pyspark import SparkConf, SparkContext
+import random
+import os
+
+spark_master_url = os.environ['SPARK_MASTER_URL']
+
+conf = SparkConf().setMaster(spark_master_url).setAppName("Pi")
+sc = SparkContext(conf=conf)
+
+num_samples = 100000000
+def inside(p):
+    x, y = random.random(), random.random()
+    return x*x + y*y < 1
+count = sc.parallelize(range(0, num_samples)).filter(inside).count()
+pi = 4 * count / num_samples
+print(pi)
+sc.stop()
+```
+
+
+
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..ce039bf
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,61 @@
+version: '3'
+
+# This docker-compose is for developer convenience, not for running in production.
+
+services:
+
+  spark-master:
+    image: bitnami/spark:3.5.1
+    container_name: spark-master
+    ports:
+      - "8080:8080"
+      - "7077:7077"
+    environment:
+      - SPARK_MODE=master
+      - SPARK_MASTER_WEBUI_PORT=8080
+      - SPARK_MASTER_HOST=0.0.0.0
+
+  spark-worker-1:
+    image: bitnami/spark:3.5.1
+    container_name: spark-worker-1
+    ports:
+      - "8081:8081"
+    environment:
+      - SPARK_MODE=worker
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_WORKER_CORES=2
+      - SPARK_WORKER_MEMORY=1G
+      - SPARK_WORKER_PORT=8091
+      - SPARK_WORKER_WEBUI_PORT=8081
+
+  spark-worker-2:
+    image: bitnami/spark:3.5.1
+    container_name: spark-worker-2
+    ports:
+      - "8082:8082"
+    environment:
+      - SPARK_MODE=worker
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_WORKER_CORES=2
+      - SPARK_WORKER_MEMORY=1G
+      - SPARK_WORKER_PORT=8092
+      - SPARK_WORKER_WEBUI_PORT=8082
+
+  spark-test-node:
+    image: bitnami/spark:3.5.1
+    container_name: spark-test-node
+    environment:
+      - SPARK_MASTER_URL=spark://spark-master:7077
+
+  notebook:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    ports:
+      - "4040:4040"
+      - "4041:4041"
+    depends_on:
+      - spark-master
+    environment:
+      - NOTEBOOK_PORT=4041
+      - SPARK_MASTER_URL=spark://spark-master:7077
\ No newline at end of file
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
new file mode 100644
index 0000000..3a70178
--- /dev/null
+++ b/scripts/entrypoint.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+echo "starting JupyterLab"
+
+WORKSPACE_DIR="/cdm_shared_workspace"
+mkdir -p "$WORKSPACE_DIR"
+cd "$WORKSPACE_DIR"
+
+# Start Jupyter Lab
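+# NOTE: an empty token and password disable Jupyter authentication entirely;
+# this is only acceptable for local development.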
+jupyter lab --ip=0.0.0.0 \
+    --port="$NOTEBOOK_PORT" \
+    --no-browser \
+    --allow-root \
+    --notebook-dir="$WORKSPACE_DIR" \
+    --ServerApp.token='' \
+    --ServerApp.password=''
\ No newline at end of file

From a1047ed3ba47b4011ae38488081ecee8d9567b65 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 08:20:52 -0500
Subject: [PATCH 02/10] add container name for notebook

---
 Dockerfile          | 7 ++++---
 docker-compose.yaml | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 4dd24e4..426c44b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,14 +4,15 @@ RUN export ORI_USER=$(id -u)
 # Switch to root to install packages
 USER root
 
+ENV PYTHON_VER=python3.11
+
 # Install necessary packages
 RUN apt-get update && apt-get install -y \
-    # Fixing error: psutil could not be installed from sources because gcc is not installed
-    gcc \
+    $PYTHON_VER python3-pip $PYTHON_VER-dev \
     && rm -rf /var/lib/apt/lists/*
 
 # Install Jupyterlab and other python dependencies
-RUN pip install jupyterlab==4.2.0 pyspark==3.5.1
+RUN pip3 install jupyterlab==4.2.0 pyspark==3.5.1
 
 COPY scripts/entrypoint.sh /opt/
 RUN chmod a+x /opt/entrypoint.sh
diff --git a/docker-compose.yaml b/docker-compose.yaml
index ce039bf..fd7c453 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -51,6 +51,7 @@ services:
   notebook:
     build:
      context: .
      dockerfile: Dockerfile
+    container_name: spark-notebook
    ports:
      - "4040:4040"
      - "4041:4041"

From 7df710ea7b64ee9cabd7af71a00820ead359f08f Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 08:37:57 -0500
Subject: [PATCH 03/10] add instruction for Rancher deployment

---
 README.md | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/README.md b/README.md
index cd3a632..82c698f 100644
--- a/README.md
+++ b/README.md
@@ -50,5 +50,64 @@ print(pi)
 sc.stop()
 ```
 
+## Rancher Deployment
+
+### Environment Variables
+- `SPARK_MASTER_URL`: `spark://spark-master:7077`
+- `NOTEBOOK_PORT`: 4041
+- `SPARK_DRIVER_HOST`: `notebook` (the hostname of the Jupyter notebook container).
+- `SPARK_DRIVER_PORT`: 7075
+- `SPARK_BLOCKMANAGER_PORT`: 7076
+
+### Spark Session/Context Configuration
+
+Make sure to configure `spark.driver.host` so that the Spark driver binds to the Jupyter notebook container's hostname:
+
+```python
+spark = SparkSession.builder \
+    .master(os.environ['SPARK_MASTER_URL']) \
+    .appName("TestSparkJob") \
+    .config("spark.driver.host", os.environ['SPARK_DRIVER_HOST']) \
+    .getOrCreate()
+```
+```python
+conf = SparkConf(). \
+    setMaster(os.environ['SPARK_MASTER_URL']). \
+    set("spark.driver.host", os.environ['SPARK_DRIVER_HOST'])
+sc = SparkContext(conf=conf)
+```
+
+```bash
+/opt/bitnami/spark/bin/spark-submit \
+    --master $SPARK_MASTER_URL \
+    --conf spark.driver.host=$SPARK_DRIVER_HOST \
+    /opt/bitnami/spark/examples/src/main/python/pi.py 10 \
+    2>/dev/null
+```
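+
+If the same notebook code must run both locally and on Rancher, the driver host can be set conditionally (a minimal
+sketch; when `SPARK_DRIVER_HOST` is unset, the option is simply omitted):
+
+```python
+import os
+from pyspark.sql import SparkSession
+
+builder = SparkSession.builder \
+    .master(os.environ['SPARK_MASTER_URL']) \
+    .appName("TestSparkJob")
+
+# Only pin the driver host when the deployment provides one
+driver_host = os.environ.get('SPARK_DRIVER_HOST')
+if driver_host:
+    builder = builder.config("spark.driver.host", driver_host)
+
+spark = builder.getOrCreate()
+```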
+
+
+
+
+
+
 
 

From 68d386a968d1c0d8fb3adeeeee53a94e310fa776 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 09:09:14 -0500
Subject: [PATCH 04/10] update rancher env variable instruction

---
 README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/README.md b/README.md
index 82c698f..17a20cc 100644
--- a/README.md
+++ b/README.md
@@ -56,8 +56,6 @@ sc.stop()
 - `SPARK_MASTER_URL`: `spark://spark-master:7077`
 - `NOTEBOOK_PORT`: 4041
 - `SPARK_DRIVER_HOST`: `notebook` (the hostname of the Jupyter notebook container).
-- `SPARK_DRIVER_PORT`: 7075
-- `SPARK_BLOCKMANAGER_PORT`: 7076
 
 ### Spark Session/Context Configuration
 

From 8e2823596f42b28dd2bdc2e1b584d7202e84d177 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 13:39:30 -0500
Subject: [PATCH 05/10] using env var

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 17a20cc..df2d702 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ docker-compose up --build
 docker exec -it spark-test-node \
     sh -c '
     /opt/bitnami/spark/bin/spark-submit \
-    --master spark://spark-master:7077 \
+    --master $SPARK_MASTER_URL \
     examples/src/main/python/pi.py 10 \
     2>/dev/null
     '

From 6bfee7e4c4fa3bae3d841489904dd20f3f85bcdd Mon Sep 17 00:00:00 2001
From: Tianhao Gu
Date: Thu, 16 May 2024 14:05:55 -0500
Subject: [PATCH 06/10] Update docker-compose.yaml

Co-authored-by: MrCreosote
---
 docker-compose.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docker-compose.yaml b/docker-compose.yaml
index fd7c453..84b6f77 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -44,6 +44,8 @@ services:
   spark-test-node:
     image: bitnami/spark:3.5.1
     container_name: spark-test-node
+    depends_on:
+      - spark-master
     environment:
       - SPARK_MASTER_URL=spark://spark-master:7077

From 7d5e88b6082b55e4369f04cc980920be40cf6e16 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 14:09:35 -0500
Subject: [PATCH 07/10] remove 4040 port for notebook

---
 Dockerfile          | 2 +-
 docker-compose.yaml | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 426c44b..c650dce 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,6 +18,6 @@ COPY scripts/entrypoint.sh /opt/
 RUN chmod a+x /opt/entrypoint.sh
 
 # Switch back to the original user
-USER ${ORI_USER}
+#USER ${ORI_USER}
 
 ENTRYPOINT ["/opt/entrypoint.sh"]
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 84b6f77..9e682c8 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -18,6 +18,8 @@ services:
   spark-worker-1:
     image: bitnami/spark:3.5.1
     container_name: spark-worker-1
+    depends_on:
+      - spark-master
     ports:
       - "8081:8081"
     environment:
       - SPARK_MODE=worker
       - SPARK_MASTER_URL=spark://spark-master:7077
       - SPARK_WORKER_CORES=2
       - SPARK_WORKER_MEMORY=1G
-      - SPARK_WORKER_PORT=8091
       - SPARK_WORKER_WEBUI_PORT=8081
 
   spark-worker-2:
     image: bitnami/spark:3.5.1
     container_name: spark-worker-2
+    depends_on:
+      - spark-master
     ports:
       - "8082:8082"
     environment:
       - SPARK_MODE=worker
       - SPARK_MASTER_URL=spark://spark-master:7077
       - SPARK_WORKER_CORES=2
       - SPARK_WORKER_MEMORY=1G
-      - SPARK_WORKER_PORT=8092
       - SPARK_WORKER_WEBUI_PORT=8082
 
   spark-test-node:
     image: bitnami/spark:3.5.1
     container_name: spark-test-node
     depends_on:
       - spark-master
     environment:
       - SPARK_MASTER_URL=spark://spark-master:7077
 
   notebook:
     build:
       context: .
       dockerfile: Dockerfile
     container_name: spark-notebook
     ports:
-      - "4040:4040"
       - "4041:4041"
     depends_on:
       - spark-master
     environment:
       - NOTEBOOK_PORT=4041
       - SPARK_MASTER_URL=spark://spark-master:7077

From a13d27f34846d1f2776a5735739d1037b2df729f Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 14:17:12 -0500
Subject: [PATCH 08/10] update readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index df2d702..1876d59 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,7 @@ spark = SparkSession.builder \
     .config("spark.driver.host", os.environ['SPARK_DRIVER_HOST']) \
     .getOrCreate()
 ```
+Or, equivalently, using `SparkConf`:
 ```python
 conf = SparkConf(). \
     setMaster(os.environ['SPARK_MASTER_URL']). \
     set("spark.driver.host", os.environ['SPARK_DRIVER_HOST'])
 sc = SparkContext(conf=conf)
 ```
 
+Submitting a job from the terminal:
 ```bash
 /opt/bitnami/spark/bin/spark-submit \
     --master $SPARK_MASTER_URL \
     --conf spark.driver.host=$SPARK_DRIVER_HOST \
     /opt/bitnami/spark/examples/src/main/python/pi.py 10 \
     2>/dev/null
 ```

From 908f3404769f26466dd6cdf91a91f13aa1d617be Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 14:30:01 -0500
Subject: [PATCH 09/10] uncomment using ori user

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index c650dce..426c44b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,6 +18,6 @@ COPY scripts/entrypoint.sh /opt/
 RUN chmod a+x /opt/entrypoint.sh
 
 # Switch back to the original user
-#USER ${ORI_USER}
+USER ${ORI_USER}
 
 ENTRYPOINT ["/opt/entrypoint.sh"]

From f9754a1267b095cb65b1e903c93429ed9480f2bb Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 16 May 2024 15:07:49 -0500
Subject: [PATCH 10/10] using image python

---
 Dockerfile | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 426c44b..f28f2cc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,11 +4,9 @@ RUN export ORI_USER=$(id -u)
 # Switch to root to install packages
 USER root
 
-ENV PYTHON_VER=python3.11
-
-# Install necessary packages
 RUN apt-get update && apt-get install -y \
-    $PYTHON_VER python3-pip $PYTHON_VER-dev \
+    # GCC required to resolve error during JupyterLab installation: psutil could not be installed from sources because gcc is not installed.
+    gcc \
     && rm -rf /var/lib/apt/lists/*
 
 # Install Jupyterlab and other python dependencies