add jupyter notebook
Tianhao-Gu committed May 16, 2024
1 parent 4b18ba4 commit 1e98bb4
Showing 4 changed files with 147 additions and 2 deletions.
20 changes: 19 additions & 1 deletion Dockerfile
@@ -1,4 +1,22 @@
FROM bitnami/spark:3.5.1

# Remember the image's default user so we can switch back after installing packages.
# Note: `RUN export ORI_USER=$(id -u)` would not persist across build steps, so use
# a build ARG instead (bitnami/spark images run as UID 1001 by default).
ARG ORI_USER=1001
# Switch to root to install packages
USER root

ENTRYPOINT ["sleep 10s"]
# Install necessary packages
RUN apt-get update && apt-get install -y \
    # Fix: psutil could not be built from source because gcc was not installed
    gcc \
    && rm -rf /var/lib/apt/lists/*

# Install JupyterLab and other Python dependencies
RUN pip install jupyterlab==4.2.0 pyspark==3.5.1

COPY scripts/entrypoint.sh /opt/
RUN chmod a+x /opt/entrypoint.sh

# Switch back to the original user
USER ${ORI_USER}

ENTRYPOINT ["/opt/entrypoint.sh"]
52 changes: 51 additions & 1 deletion README.md
@@ -1,4 +1,54 @@
# CDM JupyterHub Dockerfiles (Prototype)

This prototype establishes a Docker container configuration for JupyterHub, providing a multi-user
environment for running Spark jobs from Jupyter notebooks.

## Using `docker-compose.yaml`

To deploy the JupyterHub container and Spark nodes locally, execute the following command:

```bash
docker-compose up --build
```
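
Once the stack is up, a quick way to verify the deployment (standard Compose commands; a sketch, not part of the commit):

```bash
# All five containers (master, two workers, test node, notebook) should report "Up"
docker-compose ps
```

The Spark master UI should then be reachable at [http://localhost:8080/](http://localhost:8080/).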

## Test Submitting a Spark Job Locally

### Submitting a Spark Job via spark-test-node
```bash
docker exec -it spark-test-node \
    sh -c '
    /opt/bitnami/spark/bin/spark-submit \
        --master spark://spark-master:7077 \
        examples/src/main/python/pi.py 10 \
        2>/dev/null
    '
```

### Submitting a Spark Job via Jupyter Notebook
After launching the [Jupyter Notebook](http://localhost:4041/), create a Spark context or session with the
master set to the `SPARK_MASTER_URL` environment variable, then submit your job. Once the job is submitted,
you can monitor its status and logs in the [Spark UI](http://localhost:8080/).

Sample code to calculate Pi using `SparkContext`:
```python
from pyspark import SparkConf, SparkContext
import random
import os

spark_master_url = os.environ['SPARK_MASTER_URL']

conf = SparkConf().setMaster(spark_master_url).setAppName("Pi")
sc = SparkContext(conf=conf)

num_samples = 100000000

def inside(p):
    # Ignore the element value; sample a random point in the unit square
    x, y = random.random(), random.random()
    return x * x + y * y < 1

# Monte Carlo estimate: the fraction of points inside the unit circle approximates pi/4
count = sc.parallelize(range(0, num_samples)).filter(inside).count()
pi = 4 * count / num_samples
print(pi)
sc.stop()
```
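
The same job can be expressed with the newer `SparkSession` API; a minimal sketch, assuming the same `SPARK_MASTER_URL` environment variable is set in the notebook container:

```python
import os

from pyspark.sql import SparkSession

# Build (or reuse) a session pointed at the cluster's master URL
spark = (
    SparkSession.builder
    .master(os.environ['SPARK_MASTER_URL'])
    .appName("PiSession")
    .getOrCreate()
)

# Smoke test: distribute a small range across the workers and count it
print(spark.range(0, 1000).count())

spark.stop()
```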



61 changes: 61 additions & 0 deletions docker-compose.yaml
@@ -0,0 +1,61 @@
version: '3'

# This docker-compose is for developer convenience, not for running in production.

services:

  spark-master:
    image: bitnami/spark:3.5.1
    container_name: spark-master
    ports:
      - "8080:8080"
      - "7077:7077"
    environment:
      - SPARK_MODE=master
      - SPARK_MASTER_WEBUI_PORT=8080
      - SPARK_MASTER_HOST=0.0.0.0

  spark-worker-1:
    image: bitnami/spark:3.5.1
    container_name: spark-worker-1
    ports:
      - "8081:8081"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_CORES=2
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_PORT=8091
      - SPARK_WORKER_WEBUI_PORT=8081

  spark-worker-2:
    image: bitnami/spark:3.5.1
    container_name: spark-worker-2
    ports:
      - "8082:8082"
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_CORES=2
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_PORT=8092
      - SPARK_WORKER_WEBUI_PORT=8082

  spark-test-node:
    image: bitnami/spark:3.5.1
    container_name: spark-test-node
    environment:
      - SPARK_MASTER_URL=spark://spark-master:7077

  notebook:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "4040:4040"
      - "4041:4041"
    depends_on:
      - spark-master
    environment:
      - NOTEBOOK_PORT=4041
      - SPARK_MASTER_URL=spark://spark-master:7077
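
Since the notebook and test node resolve the cluster through the `SPARK_MASTER_URL` variable defined above, a quick sketch for double-checking that it is set inside a running service:

```bash
# Print the master URL as seen by the notebook container
docker-compose exec notebook printenv SPARK_MASTER_URL
```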
16 changes: 16 additions & 0 deletions scripts/entrypoint.sh
@@ -0,0 +1,16 @@
#!/bin/bash

echo "starting jupyter notebook"

WORKSPACE_DIR="/cdm_shared_workspace"
mkdir -p "$WORKSPACE_DIR"
cd "$WORKSPACE_DIR"

# Start JupyterLab (authentication disabled: for local development only)
jupyter lab --ip=0.0.0.0 \
    --port="$NOTEBOOK_PORT" \
    --no-browser \
    --allow-root \
    --notebook-dir="$WORKSPACE_DIR" \
    --ServerApp.token='' \
    --ServerApp.password=''
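
The script assumes `NOTEBOOK_PORT` is set (the compose file passes 4041). To exercise the entrypoint outside Compose, a hedged example reusing the illustrative `cdm-notebook` tag from the build sketch above:

```bash
# Run the image directly; JupyterLab will listen on the mapped port
docker run --rm -e NOTEBOOK_PORT=4041 -p 4041:4041 cdm-notebook
```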
