Add JupyterHub scaffolding next to the existing JupyterLab containers:
image prerequisites (nodejs/npm, configurable-http-proxy, hub directories),
dev/user JupyterHub compose services, and a JUPYTER_MODE switch in the
notebook entrypoint.

NOTE(review): this patch was rebuilt from a whitespace-collapsed copy.
Leading indentation and whitespace-only context lines are best-effort and
must be confirmed against the target files before applying. The "index"
hashes are those of the original commit and do not describe the adjusted
post-image.

diff --git a/Dockerfile b/Dockerfile
index 9864ff5..85f498b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,7 +13,7 @@ RUN groupadd -r spark && useradd -r -g spark spark_user
 
 RUN apt-get update && apt-get install -y \
     # GCC required to resolve error during JupyterLab installation: psutil could not be installed from sources because gcc is not installed.
-    gcc curl git graphviz graphviz-dev libgdal-dev build-essential python3-dev\
+    gcc curl git npm nodejs graphviz graphviz-dev libgdal-dev build-essential python3-dev\
     && rm -rf /var/lib/apt/lists/*
 
 ENV HADOOP_AWS_VER=3.3.4
@@ -42,13 +42,26 @@ RUN pipenv sync --system
 
 RUN chown -R spark_user:spark /opt/bitnami
 
-# Set up Jupyter directories
+# Set up Jupyter Lab directories
 ENV JUPYTER_CONFIG_DIR=/.jupyter
 ENV JUPYTER_RUNTIME_DIR=/.jupyter/runtime
 ENV JUPYTER_DATA_DIR=/.jupyter/data
 RUN mkdir -p ${JUPYTER_CONFIG_DIR} ${JUPYTER_RUNTIME_DIR} ${JUPYTER_DATA_DIR}
 RUN chown -R spark_user:spark /.jupyter
 
+# Set up Jupyter Hub directories
+ENV JUPYTERHUB_CONFIG_DIR=/srv/jupyterhub
+RUN mkdir -p ${JUPYTERHUB_CONFIG_DIR}
+COPY ./src/notebook_utils/startup.py ${JUPYTERHUB_CONFIG_DIR}/startup.py
+RUN chown -R spark_user:spark ${JUPYTERHUB_CONFIG_DIR}
+
+# Jupyter Hub user home directory
+RUN mkdir -p /jupyterhub/users_home
+RUN chown -R spark_user:spark /jupyterhub/users_home
+
+# Proxy for Jupyter Hub (the reason nodejs/npm are installed above); drop the npm cache in the same layer to keep the image small
+RUN npm install -g configurable-http-proxy && npm cache clean --force
+
 COPY ./src/ /src
 ENV PYTHONPATH "${PYTHONPATH}:/src"
 
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 8ec8ee1..11c4bcd 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -123,11 +123,11 @@ services:
       - ./config/yarn-write-policy.json:/config/yarn-write-policy.json
       - ./scripts/minio_create_bucket_entrypoint.sh:/scripts/minio_create_bucket_entrypoint.sh
 
-  dev_notebook:
+  dev_jupyterlab:
     build:
       context: .
       dockerfile: Dockerfile
-    container_name: spark-dev-notebook
+    container_name: dev-jupyterlab
     ports:
       - "4041:4041"
     depends_on:
@@ -135,9 +135,10 @@ services:
       - minio-create-bucket
     environment:
       - NOTEBOOK_PORT=4041
+      - JUPYTER_MODE=jupyterlab
       - YARN_RESOURCE_MANAGER_URL=http://yarn-resourcemanager:8032
       - SPARK_MASTER_URL=spark://spark-master:7077
-      - SPARK_DRIVER_HOST=spark-dev-notebook
+      - SPARK_DRIVER_HOST=dev-jupyterlab
       - MINIO_URL=http://minio:9002
       - MINIO_ACCESS_KEY=minio-readwrite
       - MINIO_SECRET_KEY=minio123
@@ -151,11 +152,11 @@ services:
     volumes:
       - ./cdr/cdm/jupyter:/cdm_shared_workspace
 
-  user_notebook:
+  user-jupyterlab:
     build:
       context: .
       dockerfile: Dockerfile
-    container_name: spark-user-notebook
+    container_name: user-jupyterlab
     ports:
       - "4042:4042"
     depends_on:
@@ -163,15 +164,16 @@ services:
       - minio-create-bucket
     environment:
       - NOTEBOOK_PORT=4042
+      - JUPYTER_MODE=jupyterlab
       - YARN_RESOURCE_MANAGER_URL=http://yarn-resourcemanager:8032
       - SPARK_MASTER_URL=spark://spark-master:7077
-      - SPARK_DRIVER_HOST=spark-user-notebook
+      - SPARK_DRIVER_HOST=user-jupyterlab
       - MINIO_URL=http://minio:9002
       - MINIO_ACCESS_KEY=minio-readonly
       - MINIO_SECRET_KEY=minio123
       - S3_YARN_BUCKET=yarn
       - MAX_EXECUTORS=4
-      # TODO: create postgres user w/ only write access to the hive tables
+      # TODO: create postgres user w/ only read access to the hive tables
       - POSTGRES_USER=hive
       - POSTGRES_PASSWORD=hivepassword
       - POSTGRES_DB=hive
@@ -179,6 +181,64 @@ services:
     volumes:
       - ./cdr/cdm/jupyter/user_shared_workspace:/cdm_shared_workspace/user_shared_workspace
 
+  dev_jupyterhub:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: dev-jupyterhub
+    ports:
+      - "4043:4043"
+    depends_on:
+      - spark-master
+      - minio-create-bucket
+    environment:
+      - NOTEBOOK_PORT=4043
+      - JUPYTER_MODE=jupyterhub
+      - YARN_RESOURCE_MANAGER_URL=http://yarn-resourcemanager:8032
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_DRIVER_HOST=dev-jupyterhub
+      - MINIO_URL=http://minio:9002
+      - MINIO_ACCESS_KEY=minio-readwrite
+      - MINIO_SECRET_KEY=minio123
+      - S3_YARN_BUCKET=yarn
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+      - USAGE_MODE=dev
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
+      - ./cdr/cdm/jupyter/jupyterhub/users_home:/jupyterhub/users_home
+
+  user_jupyterhub:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: user-jupyterhub
+    ports:
+      - "4044:4044"
+    depends_on:
+      - spark-master
+      - minio-create-bucket
+    environment:
+      - NOTEBOOK_PORT=4044
+      - JUPYTER_MODE=jupyterhub
+      - YARN_RESOURCE_MANAGER_URL=http://yarn-resourcemanager:8032
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_DRIVER_HOST=user-jupyterhub
+      - MINIO_URL=http://minio:9002
+      - MINIO_ACCESS_KEY=minio-readonly
+      - MINIO_SECRET_KEY=minio123
+      - S3_YARN_BUCKET=yarn
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter/jupyterhub/users_home:/jupyterhub/users_home
+
   postgres:
     image: postgres:16.3
     restart: always
diff --git a/scripts/notebook_entrypoint.sh b/scripts/notebook_entrypoint.sh
index daf3adf..f9a766f 100644
--- a/scripts/notebook_entrypoint.sh
+++ b/scripts/notebook_entrypoint.sh
@@ -1,7 +1,5 @@
 #!/bin/bash
 
-echo "starting jupyter notebook"
-
 # Ensure NOTEBOOK_DIR is set
 if [ -z "$NOTEBOOK_DIR" ]; then
     echo "ERROR: NOTEBOOK_DIR is not set. Please run setup.sh first."
@@ -10,17 +8,28 @@ fi
 
 mkdir -p "$NOTEBOOK_DIR" && cd "$NOTEBOOK_DIR"
 
-# install Plotly extension
-jupyter labextension install jupyterlab-plotly@5.23.0
 
-# install ipywidgets extension
-jupyter labextension install @jupyter-widgets/jupyterlab-manager@8.1.3
+if [ "$JUPYTER_MODE" = "jupyterlab" ]; then
+    echo "starting jupyterlab"
+    # install Plotly extension
+    jupyter labextension install jupyterlab-plotly@5.23.0
+
+    # install ipywidgets extension
+    jupyter labextension install @jupyter-widgets/jupyterlab-manager@8.1.3
+
+    # Start Jupyter Lab (exec replaces this shell so the server receives container stop signals directly)
+    exec jupyter lab --ip=0.0.0.0 \
+        --port="$NOTEBOOK_PORT" \
+        --no-browser \
+        --allow-root \
+        --notebook-dir="$NOTEBOOK_DIR" \
+        --ServerApp.token='' \
+        --ServerApp.password=''
+elif [ "$JUPYTER_MODE" = "jupyterhub" ]; then
+    echo "starting jupyterhub"
 
-# Start Jupyter Lab
-jupyter lab --ip=0.0.0.0 \
-    --port="$NOTEBOOK_PORT" \
-    --no-browser \
-    --allow-root \
-    --notebook-dir="$NOTEBOOK_DIR" \
-    --ServerApp.token='' \
-    --ServerApp.password=''
\ No newline at end of file
+    echo "TO BE IMPLEMENTED"
+else
+    echo "ERROR: JUPYTER_MODE is not set to jupyterlab or jupyterhub. Please set JUPYTER_MODE to either jupyterlab or jupyterhub."
+    exit 1
+fi