Skip to content

Commit

Permalink
Merge pull request #77 from kbase/dev_jupyterhub
Browse files Browse the repository at this point in the history
add JUPYTER_MODE for JupyterHub
  • Loading branch information
Tianhao-Gu authored Sep 3, 2024
2 parents 886ab19 + 2ecedd6 commit a208fa7
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 23 deletions.
16 changes: 14 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ RUN groupadd -r spark && useradd -r -g spark spark_user

RUN apt-get update && apt-get install -y \
# GCC required to resolve error during JupyterLab installation: psutil could not be installed from sources because gcc is not installed.
gcc curl git graphviz graphviz-dev libgdal-dev build-essential python3-dev\
gcc curl git npm nodejs graphviz graphviz-dev libgdal-dev build-essential python3-dev\
&& rm -rf /var/lib/apt/lists/*

ENV HADOOP_AWS_VER=3.3.4
Expand Down Expand Up @@ -42,13 +42,25 @@ RUN pipenv sync --system

RUN chown -R spark_user:spark /opt/bitnami

# Set up Jupyter directories
# Set up Jupyter Lab directories
ENV JUPYTER_CONFIG_DIR=/.jupyter
ENV JUPYTER_RUNTIME_DIR=/.jupyter/runtime
ENV JUPYTER_DATA_DIR=/.jupyter/data
RUN mkdir -p ${JUPYTER_CONFIG_DIR} ${JUPYTER_RUNTIME_DIR} ${JUPYTER_DATA_DIR}
RUN chown -R spark_user:spark /.jupyter

# Set up the JupyterHub configuration directory and ship startup.py into it
ENV JUPYTERHUB_CONFIG_DIR=/srv/jupyterhub
RUN mkdir -p ${JUPYTERHUB_CONFIG_DIR}
COPY ./src/notebook_utils/startup.py ${JUPYTERHUB_CONFIG_DIR}/startup.py
# chown runs after the COPY so the copied startup.py is also owned by spark_user
RUN chown -R spark_user:spark ${JUPYTERHUB_CONFIG_DIR}

# JupyterHub users' home directory root, owned by spark_user
# (docker-compose.yaml bind-mounts a host directory over this path)
RUN mkdir -p /jupyterhub/users_home
RUN chown -R spark_user:spark /jupyterhub/users_home

# Install configurable-http-proxy globally via npm (npm/nodejs come from the apt step above)
# NOTE(review): presumably needed as JupyterHub's proxy; the version is unpinned - confirm
RUN npm install -g configurable-http-proxy

COPY ./src/ /src
ENV PYTHONPATH "${PYTHONPATH}:/src"

Expand Down
75 changes: 68 additions & 7 deletions docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -123,21 +123,22 @@ services:
- ./config/yarn-write-policy.json:/config/yarn-write-policy.json
- ./scripts/minio_create_bucket_entrypoint.sh:/scripts/minio_create_bucket_entrypoint.sh

dev_notebook:
dev_jupyterlab:
build:
context: .
dockerfile: Dockerfile
container_name: spark-dev-notebook
container_name: dev-jupyterlab
ports:
- "4041:4041"
depends_on:
- spark-master
- minio-create-bucket
environment:
- NOTEBOOK_PORT=4041
- JUPYTER_MODE=jupyterlab
- YARN_RESOURCE_MANAGER_URL=http://yarn-resourcemanager:8032
- SPARK_MASTER_URL=spark://spark-master:7077
- SPARK_DRIVER_HOST=spark-dev-notebook
- SPARK_DRIVER_HOST=dev-jupyterlab
- MINIO_URL=http://minio:9002
- MINIO_ACCESS_KEY=minio-readwrite
- MINIO_SECRET_KEY=minio123
Expand All @@ -151,34 +152,94 @@ services:
volumes:
- ./cdr/cdm/jupyter:/cdm_shared_workspace

user_notebook:
user-jupyterlab:
build:
context: .
dockerfile: Dockerfile
container_name: spark-user-notebook
container_name: user-jupyterlab
ports:
- "4042:4042"
depends_on:
- spark-master
- minio-create-bucket
environment:
- NOTEBOOK_PORT=4042
- JUPYTER_MODE=jupyterlab
- YARN_RESOURCE_MANAGER_URL=http://yarn-resourcemanager:8032
- SPARK_MASTER_URL=spark://spark-master:7077
- SPARK_DRIVER_HOST=spark-user-notebook
- SPARK_DRIVER_HOST=user-jupyterlab
- MINIO_URL=http://minio:9002
- MINIO_ACCESS_KEY=minio-readonly
- MINIO_SECRET_KEY=minio123
- S3_YARN_BUCKET=yarn
- MAX_EXECUTORS=4
# TODO: create postgres user w/ only write access to the hive tables
# TODO: create postgres user w/ only read access to the hive tables
- POSTGRES_USER=hive
- POSTGRES_PASSWORD=hivepassword
- POSTGRES_DB=hive
- POSTGRES_URL=postgres:5432
volumes:
- ./cdr/cdm/jupyter/user_shared_workspace:/cdm_shared_workspace/user_shared_workspace

# JupyterHub (dev) service: built from the repository Dockerfile and started
# with JUPYTER_MODE=jupyterhub, using the minio-readwrite access key.
dev_jupyterhub:
  build:
    context: .
    dockerfile: Dockerfile
  container_name: dev-jupyterhub
  ports:
    - "4043:4043"
  depends_on:
    - spark-master
    - minio-create-bucket
  environment:
    # Same value as the container port published above.
    - NOTEBOOK_PORT=4043
    - JUPYTER_MODE=jupyterhub
    - YARN_RESOURCE_MANAGER_URL=http://yarn-resourcemanager:8032
    - SPARK_MASTER_URL=spark://spark-master:7077
    # Kept equal to container_name, as in the other notebook services
    # (was misspelled "dev-jupterhub", which names no container).
    - SPARK_DRIVER_HOST=dev-jupyterhub
    - MINIO_URL=http://minio:9002
    - MINIO_ACCESS_KEY=minio-readwrite
    - MINIO_SECRET_KEY=minio123
    - S3_YARN_BUCKET=yarn
    - MAX_EXECUTORS=4
    - POSTGRES_USER=hive
    - POSTGRES_PASSWORD=hivepassword
    - POSTGRES_DB=hive
    - POSTGRES_URL=postgres:5432
    - USAGE_MODE=dev
  volumes:
    - ./cdr/cdm/jupyter:/cdm_shared_workspace
    # Host directory mounted over the users_home path created in the Dockerfile.
    - ./cdr/cdm/jupyter/jupyterhub/users_home:/jupyterhub/users_home

# JupyterHub (user) service: built from the repository Dockerfile and started
# with JUPYTER_MODE=jupyterhub, using the minio-readonly access key.
user_jupyterhub:
  build:
    context: .
    dockerfile: Dockerfile
  container_name: user-jupyterhub
  ports:
    - "4044:4044"
  depends_on:
    - spark-master
    - minio-create-bucket
  environment:
    # Same value as the container port published above.
    - NOTEBOOK_PORT=4044
    # Listed once; the original repeated this entry further down the list.
    - JUPYTER_MODE=jupyterhub
    - YARN_RESOURCE_MANAGER_URL=http://yarn-resourcemanager:8032
    - SPARK_MASTER_URL=spark://spark-master:7077
    # Kept equal to container_name, as in the other notebook services.
    - SPARK_DRIVER_HOST=user-jupyterhub
    - MINIO_URL=http://minio:9002
    - MINIO_ACCESS_KEY=minio-readonly
    - MINIO_SECRET_KEY=minio123
    - S3_YARN_BUCKET=yarn
    - MAX_EXECUTORS=4
    - POSTGRES_USER=hive
    - POSTGRES_PASSWORD=hivepassword
    - POSTGRES_DB=hive
    - POSTGRES_URL=postgres:5432
  volumes:
    # Host directory mounted over the users_home path created in the Dockerfile.
    - ./cdr/cdm/jupyter/jupyterhub/users_home:/jupyterhub/users_home

postgres:
image: postgres:16.3
restart: always
Expand Down
37 changes: 23 additions & 14 deletions scripts/notebook_entrypoint.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#!/bin/bash

echo "starting jupyter notebook"

# Ensure NOTEBOOK_DIR is set
if [ -z "$NOTEBOOK_DIR" ]; then
echo "ERROR: NOTEBOOK_DIR is not set. Please run setup.sh first."
Expand All @@ -10,17 +8,28 @@ fi

mkdir -p "$NOTEBOOK_DIR" && cd "$NOTEBOOK_DIR"

# install Plotly extension
jupyter labextension install [email protected]

# install ipywidgets extension
jupyter labextension install @jupyter-widgets/[email protected]
if [ "$JUPYTER_MODE" = "jupyterlab" ]; then
echo "starting jupyterlab"
# install Plotly extension
jupyter labextension install [email protected]

# install ipywidgets extension
jupyter labextension install @jupyter-widgets/[email protected]

# Start Jupyter Lab
jupyter lab --ip=0.0.0.0 \
--port="$NOTEBOOK_PORT" \
--no-browser \
--allow-root \
--notebook-dir="$NOTEBOOK_DIR" \
--ServerApp.token='' \
--ServerApp.password=''
elif [ "$JUPYTER_MODE" = "jupyterhub" ]; then
echo "starting jupyterhub"

# Start Jupyter Lab
jupyter lab --ip=0.0.0.0 \
--port="$NOTEBOOK_PORT" \
--no-browser \
--allow-root \
--notebook-dir="$NOTEBOOK_DIR" \
--ServerApp.token='' \
--ServerApp.password=''
echo "TO BE IMPLEMENTED"
else
echo "ERROR: JUPYTER_MODE is not set to jupyterlab or jupyterhub. Please set JUPYTER_MODE to either jupyterlab or jupyterhub."
exit 1
fi

0 comments on commit a208fa7

Please sign in to comment.