diff --git a/Dockerfile b/Dockerfile
index 8233426..4dd24e4 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,24 @@
 FROM bitnami/spark:3.5.1
 
+# Save the image's default non-root UID (1001 in bitnami images) as a build ARG;
+# a `RUN export ...` would not persist across build layers
+ARG ORI_USER=1001
+# Switch to root to install packages
+USER root
 
-ENTRYPOINT ["sleep 10s"]
+# Install necessary packages
+RUN apt-get update && apt-get install -y \
+    # Fixing error: psutil could not be installed from sources because gcc is not installed
+    gcc \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install JupyterLab and other Python dependencies
+RUN pip install jupyterlab==4.2.0 pyspark==3.5.1
+
+COPY scripts/entrypoint.sh /opt/
+RUN chmod a+x /opt/entrypoint.sh
+
+# Switch back to the original user
+USER ${ORI_USER}
+
+ENTRYPOINT ["/opt/entrypoint.sh"]
diff --git a/README.md b/README.md
index ce88440..cd3a632 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,67 @@
 # CDM Jupyterhub dockerfiles (Prototype)
 
 This prototype establishes a Docker container configuration for JupyterHub, designed to furnish a multi-user
-environment tailored for executing Spark jobs via Jupyter notebooks.
\ No newline at end of file
+environment tailored for executing Spark jobs via Jupyter notebooks.
+
+## Using `docker-compose.yaml`
+
+To deploy the JupyterHub container and Spark nodes locally, execute the following command:
+
+```bash
+docker-compose up --build
+```
+
+## Test Submitting a Spark Job Locally
+
+### Submitting a Spark Job via spark-test-node
+```bash
+docker exec -it spark-test-node \
+    sh -c '
+    /opt/bitnami/spark/bin/spark-submit \
+        --master spark://spark-master:7077 \
+        examples/src/main/python/pi.py 10 \
+        2>/dev/null
+    '
+```
+
+### Submitting a Spark Job via Jupyter Notebook
+After launching the [Jupyter Notebook](http://localhost:4041/), establish a Spark context or session with the Spark
+master set to the environment variable `SPARK_MASTER_URL` and proceed to submit your job. Once the job is submitted,
+you can monitor its status and logs in the [Spark UI](http://localhost:8080/).
+
+Sample code to calculate Pi using `SparkContext`:
+```python
+from pyspark import SparkConf, SparkContext
+import random
+import os
+
+spark_master_url = os.environ['SPARK_MASTER_URL']
+
+conf = SparkConf().setMaster(spark_master_url).setAppName("Pi")
+sc = SparkContext(conf=conf)
+
+num_samples = 100000000
+def inside(p):
+    x, y = random.random(), random.random()
+    return x*x + y*y < 1
+count = sc.parallelize(range(0, num_samples)).filter(inside).count()
+pi = 4 * count / num_samples
+print(pi)
+sc.stop()
+```
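+
+Alternatively, a minimal sketch using the `SparkSession` API (it assumes the same `SPARK_MASTER_URL`
+environment variable; the app name and row count are arbitrary placeholders):
+```python
+from pyspark.sql import SparkSession
+import os
+
+spark = SparkSession.builder \
+    .master(os.environ['SPARK_MASTER_URL']) \
+    .appName("ConnectionTest") \
+    .getOrCreate()
+
+# A trivial job that confirms the session can reach the cluster
+print(spark.range(1000).count())
+spark.stop()
+```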
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..ce039bf
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,61 @@
+version: '3'
+
+# This docker-compose is for developer convenience, not for running in production.
+
+services:
+
+  spark-master:
+    image: bitnami/spark:3.5.1
+    container_name: spark-master
+    ports:
+      - "8080:8080"
+      - "7077:7077"
+    environment:
+      - SPARK_MODE=master
+      - SPARK_MASTER_WEBUI_PORT=8080
+      - SPARK_MASTER_HOST=0.0.0.0
+
+  spark-worker-1:
+    image: bitnami/spark:3.5.1
+    container_name: spark-worker-1
+    ports:
+      - "8081:8081"
+    environment:
+      - SPARK_MODE=worker
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_WORKER_CORES=2
+      - SPARK_WORKER_MEMORY=1G
+      - SPARK_WORKER_PORT=8091
+      - SPARK_WORKER_WEBUI_PORT=8081
+
+  spark-worker-2:
+    image: bitnami/spark:3.5.1
+    container_name: spark-worker-2
+    ports:
+      - "8082:8082"
+    environment:
+      - SPARK_MODE=worker
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_WORKER_CORES=2
+      - SPARK_WORKER_MEMORY=1G
+      - SPARK_WORKER_PORT=8092
+      - SPARK_WORKER_WEBUI_PORT=8082
+
+  spark-test-node:
+    image: bitnami/spark:3.5.1
+    container_name: spark-test-node
+    environment:
+      - SPARK_MASTER_URL=spark://spark-master:7077
+
+  notebook:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    ports:
+      - "4040:4040"
+      - "4041:4041"
+    depends_on:
+      - spark-master
+    environment:
+      - NOTEBOOK_PORT=4041
+      - SPARK_MASTER_URL=spark://spark-master:7077
\ No newline at end of file
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
new file mode 100644
index 0000000..3a70178
--- /dev/null
+++ b/scripts/entrypoint.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+echo "starting JupyterLab"
+
+WORKSPACE_DIR="/cdm_shared_workspace"
+mkdir -p "$WORKSPACE_DIR"
+cd "$WORKSPACE_DIR"
+
+# Start Jupyter Lab
+jupyter lab --ip=0.0.0.0 \
+    --port="${NOTEBOOK_PORT}" \
+    --no-browser \
+    --allow-root \
+    --notebook-dir="$WORKSPACE_DIR" \
+    --ServerApp.token='' \
+    --ServerApp.password=''
\ No newline at end of file