using postgres as hive metastore #27

Merged: 3 commits, Jun 8, 2024
Changes from 2 commits
8 changes: 8 additions & 0 deletions Dockerfile
@@ -13,6 +13,7 @@ ENV HADOOP_AWS_VER=3.3.4
# NOTE: ensure Delta Spark jar version matches python pip delta-spark version specified in the Pipfile
ENV DELTA_SPARK_VER=3.2.0
ENV SCALA_VER=2.12
ENV POSTGRES_JDBC_VER=42.2.23

# Run Gradle task to download JARs to /gradle/gradle_jars location
COPY build.gradle settings.gradle gradlew /gradle/
@@ -37,6 +38,13 @@ COPY ./src/notebook/startup.py /.ipython/profile_default/startup/
COPY ./scripts/ /opt/scripts/
RUN chmod a+x /opt/scripts/*.sh

# Copy the configuration files
COPY ./config/ /opt/config/

# This is the shared directory between the spark master, worker and driver containers
ENV CDM_SHARED_DIR=/cdm_shared_workspace
RUN mkdir -p ${CDM_SHARED_DIR} && chmod -R 777 ${CDM_SHARED_DIR}

# Switch back to the original user
USER ${ORI_USER}

2 changes: 2 additions & 0 deletions build.gradle
@@ -16,10 +16,12 @@ repositories {
def hadoopAwsVersion = System.getenv('HADOOP_AWS_VER')
def deltaSparkVersion = System.getenv('DELTA_SPARK_VER')
def scalaVersion = System.getenv('SCALA_VER')
def postgresVersion = System.getenv('POSTGRES_JDBC_VER')

dependencies {
    implementation "org.apache.hadoop:hadoop-aws:$hadoopAwsVersion"
    implementation "io.delta:delta-spark_${scalaVersion}:$deltaSparkVersion"
    implementation "org.postgresql:postgresql:$postgresVersion"
}

task downloadDependencies(type: Copy) {
47 changes: 47 additions & 0 deletions config/hive-site-template.xml
@@ -0,0 +1,47 @@
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:postgresql://{{POSTGRES_URL}}/{{POSTGRES_DB}}</value>
  </property>

  <!-- JDBC driver class name for PostgreSQL -->
  <!-- Ensure that PostgreSQL JDBC driver jars are included via Gradle -->
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>org.postgresql.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>{{POSTGRES_USER}}</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>{{POSTGRES_PASSWORD}}</value>
  </property>

  <!-- Automatically create the necessary tables in the database schema if they don't exist -->
  <!-- The Hive metastore may fail to start if this property is unset and the tables have not been created previously -->
  <property>
    <name>datanucleus.schema.autoCreateTables</name>
    <value>{{DATANUCLEUS_AUTO_CREATE_TABLES}}</value>
  </property>

  <!-- Disable schema verification in the Hive metastore -->
  <!-- The Hive metastore fails to start if this property is not set -->
  <property>
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
  </property>

  <!-- Directory location for the Hive warehouse where table data is stored -->
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/cdm_shared_workspace/hive_metastore</value>
  </property>

  <!-- Enable support for concurrency in Hive, allowing multiple users to access and modify the data simultaneously -->
  <property>
    <name>hive.support.concurrency</name>
    <value>true</value>
  </property>
</configuration>
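Once setup.sh renders this template into $SPARK_HOME/conf/hive-site.xml, a Spark session only needs Hive support enabled to use the Postgres-backed metastore. A minimal smoke-test sketch, assuming the compose stack below is running (the app name and table are illustrative, not part of this PR):

```python
from pyspark.sql import SparkSession

# Build a session against the compose stack's master; enableHiveSupport()
# makes Spark read $SPARK_HOME/conf/hive-site.xml and talk to the
# Postgres-backed metastore configured above.
spark = (
    SparkSession.builder
    .appName("metastore-smoke-test")        # illustrative name
    .master("spark://spark-master:7077")
    .enableHiveSupport()
    .getOrCreate()
)

# First metastore access; with datanucleus.schema.autoCreateTables=true
# this also creates the backing tables in Postgres.
spark.sql("CREATE TABLE IF NOT EXISTS smoke_test (id INT) USING parquet")
spark.sql("SHOW TABLES").show()
```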
101 changes: 83 additions & 18 deletions docker-compose.yaml
@@ -14,9 +14,18 @@ services:
     environment:
       - SPARK_MODE=master
       - SPARK_MASTER_WEBUI_PORT=8090
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
 
   spark-worker-1:
-    image: bitnami/spark:3.5.1
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: spark-worker-1
     depends_on:
       - spark-master
@@ -28,9 +37,17 @@
       - SPARK_WORKER_CORES=2
       - SPARK_WORKER_MEMORY=1G
       - SPARK_WORKER_WEBUI_PORT=8081
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
 
   spark-worker-2:
-    image: bitnami/spark:3.5.1
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: spark-worker-2
     depends_on:
       - spark-master
@@ -42,14 +59,12 @@
       - SPARK_WORKER_CORES=2
       - SPARK_WORKER_MEMORY=1G
       - SPARK_WORKER_WEBUI_PORT=8082
-
-  spark-test-node:
-    image: bitnami/spark:3.5.1
-    container_name: spark-test-node
-    depends_on:
-      - spark-master
-    environment:
-      - SPARK_MASTER_URL=spark://spark-master:7077
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
     volumes:
       - ./cdr/cdm/jupyter:/cdm_shared_workspace
 
   minio:
     image: minio/minio
@@ -78,18 +93,18 @@ services:
     entrypoint: >
       bash -c "
       mc alias set minio http://minio:9002 minio minio123 &&
-      if ! mc ls minio/delta-lake 2>/dev/null; then
-        mc mb minio/delta-lake && echo 'Bucket delta-lake created'
+      if ! mc ls minio/cdm-lake 2>/dev/null; then
+        mc mb minio/cdm-lake && echo 'Bucket cdm-lake created'
       else
-        echo 'bucket delta-lake already exists'
+        echo 'bucket cdm-lake already exists'
       fi
       "
 
-  notebook:
+  dev_notebook:
     build:
       context: .
       dockerfile: Dockerfile
-    container_name: spark-notebook
+    container_name: spark-dev-notebook
     ports:
       - "4041:4041"
     depends_on:
@@ -98,11 +113,61 @@
     environment:
       - NOTEBOOK_PORT=4041
       - SPARK_MASTER_URL=spark://spark-master:7077
-      - SPARK_DRIVER_HOST=spark-notebook
+      - SPARK_DRIVER_HOST=spark-dev-notebook
       - MINIO_URL=http://minio:9002
       - MINIO_ACCESS_KEY=minio
       - MINIO_SECRET_KEY=minio123
       - SPARK_MODE=notebook
-      - MAX_EXECUTORS=2
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+      - USAGE_MODE=dev # Dev mode grants full MinIO access and extra privileges, e.g. letting Hive create metastore tables (see scripts/setup.sh).
     volumes:
       - ./cdr/cdm/jupyter:/cdm_shared_workspace
+
+  user_notebook:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: spark-user-notebook
+    ports:
+      - "4042:4042"
+    depends_on:
+      - spark-master
+      - minio-create-bucket
+    environment:
+      - NOTEBOOK_PORT=4042
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_DRIVER_HOST=spark-user-notebook
+      - MINIO_URL=http://minio:9002
+      - MINIO_ACCESS_KEY=minio # TODO: create a MinIO user and policy for read-only access
+      - MINIO_SECRET_KEY=minio123
+      - SPARK_MODE=notebook
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
+
+  postgres:
+    image: postgres:16.3
+    restart: always
+    container_name: postgres
+    # To avoid incorrect user permissions, manually create the volume directory before running Docker:
+    #   export UID=$(id -u)
+    #   export GID=$(id -g)
+    #   mkdir -p cdr/cdm/jupyter/cdm-postgres
+    # Reference: https://forums.docker.com/t/systemd-coredump-taking-ownership-of-tmp-db-directory-and-contents-in-rails-app/93609
+    user: "${UID}:${GID}"
+    ports:
+      - "5432:5432"
+    environment:
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
+      - ./cdr/cdm/jupyter/cdm-postgres:/var/lib/postgresql/data # Local development only; in Rancher, PostgreSQL data shouldn't be stored in a shared mount.
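To confirm the metastore schema actually lands in this Postgres instance, one can inspect the hive database directly. A hedged sketch using psycopg2 (not among this PR's dependencies; host, port, and credentials mirror the service definition above):

```python
import psycopg2  # assumed available; not installed by this PR

# Connect to the metastore DB that the postgres service publishes on
# localhost:5432.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    dbname="hive",
    user="hive",
    password="hivepassword",
)
with conn, conn.cursor() as cur:
    cur.execute(
        "SELECT table_name FROM information_schema.tables "
        "WHERE table_schema = 'public' ORDER BY table_name"
    )
    # After the first Hive query with autoCreateTables=true, DataNucleus
    # tables such as DBS and TBLS should be listed here.
    for (name,) in cur.fetchall():
        print(name)
conn.close()
```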
2 changes: 2 additions & 0 deletions scripts/entrypoint.sh
@@ -1,5 +1,7 @@
#!/bin/bash

/opt/scripts/setup.sh

if [ "$SPARK_MODE" = "notebook" ]; then
exec /opt/scripts/notebook_entrypoint.sh "$@"
else
29 changes: 2 additions & 27 deletions scripts/notebook_entrypoint.sh
@@ -2,38 +2,13 @@
 
 echo "starting jupyter notebook"
 
-source /opt/bitnami/scripts/spark-env.sh
-if [ -z "$SPARK_CONF_FILE" ]; then
-    echo "Error: unable to find SPARK_CONF_FILE path"
-    exit 1
-fi
-
-# Set Spark configurations
-{
-  # Set dynamic allocation configurations to allow parallel job executions
-  if [ -z "$MAX_EXECUTORS" ]; then
-    # If MAX_EXECUTORS is not set, default to 5. Adjust as needed.
-    MAX_EXECUTORS=5
-  fi
-  echo "spark.dynamicAllocation.enabled true"
-  echo "spark.dynamicAllocation.minExecutors 1"
-  echo "spark.dynamicAllocation.maxExecutors $MAX_EXECUTORS"
-
-  # Set spark.driver.host if SPARK_DRIVER_HOST is set
-  if [ -n "$SPARK_DRIVER_HOST" ]; then
-    echo "spark.driver.host $SPARK_DRIVER_HOST"
-  fi
-} >> "$SPARK_CONF_FILE"
-
-WORKSPACE_DIR="/cdm_shared_workspace"
-mkdir -p "$WORKSPACE_DIR"
-cd "$WORKSPACE_DIR"
+cd "$CDM_SHARED_DIR"
 
 # Start Jupyter Lab
 jupyter lab --ip=0.0.0.0 \
     --port="$NOTEBOOK_PORT" \
     --no-browser \
     --allow-root \
-    --notebook-dir="$WORKSPACE_DIR" \
+    --notebook-dir="$CDM_SHARED_DIR" \
     --ServerApp.token='' \
     --ServerApp.password=''
57 changes: 57 additions & 0 deletions scripts/setup.sh
Review comment (Member): this is getting big enough we might want to consider porting it to python

Reply (Collaborator, author): 👍
@@ -0,0 +1,57 @@
#!/bin/bash

# This script sets up the Spark environment variables and configurations for Spark master, worker, and driver (Jupyter) nodes.

# Load Spark environment variables
source /opt/bitnami/scripts/spark-env.sh
if [ -z "$SPARK_CONF_FILE" ]; then
    echo "Error: unable to find SPARK_CONF_FILE path"
    exit 1
fi

# Set Spark configurations
{
  # Set dynamic allocation configurations to allow parallel job executions
  if [ -z "$MAX_EXECUTORS" ]; then
    # If MAX_EXECUTORS is not set, default to 5. Adjust as needed.
    MAX_EXECUTORS=5
  fi
  echo "spark.dynamicAllocation.enabled true"
  echo "spark.dynamicAllocation.minExecutors 1"
  echo "spark.dynamicAllocation.maxExecutors $MAX_EXECUTORS"

  # Set spark.driver.host if SPARK_DRIVER_HOST is set
  if [ -n "$SPARK_DRIVER_HOST" ]; then
    echo "spark.driver.host $SPARK_DRIVER_HOST"
  fi
} >> "$SPARK_CONF_FILE"

# Configure hive-site.xml for Hive support
sed -e "s|{{POSTGRES_URL}}|${POSTGRES_URL}|g" \
    -e "s|{{POSTGRES_DB}}|${POSTGRES_DB}|g" \
    -e "s|{{POSTGRES_USER}}|${POSTGRES_USER}|g" \
    -e "s|{{POSTGRES_PASSWORD}}|${POSTGRES_PASSWORD}|g" \
    /opt/config/hive-site-template.xml > "$SPARK_HOME"/conf/hive-site.xml


update_config() {
  sed -i "s|{{DATANUCLEUS_AUTO_CREATE_TABLES}}|${DATANUCLEUS_AUTO_CREATE_TABLES}|g" "$SPARK_HOME"/conf/hive-site.xml
}

# Apply settings based on server usage mode
set_environment() {
  local lowercase_usage_mode=${USAGE_MODE,,}  # Convert to lowercase

  case "$lowercase_usage_mode" in
    dev)
      export DATANUCLEUS_AUTO_CREATE_TABLES=true
      ;;
    *)
      export DATANUCLEUS_AUTO_CREATE_TABLES=false
      ;;
  esac
  update_config
  echo "Environment settings applied for $USAGE_MODE."
}

set_environment
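Following up on the review suggestion to port this script to Python, the hive-site templating could translate roughly as below. A sketch only, not part of this PR; the paths and environment variables mirror the shell version, and the function name is hypothetical:

```python
#!/usr/bin/env python3
"""Hypothetical Python port of setup.sh's hive-site templating."""
import os
from pathlib import Path

TEMPLATE = Path("/opt/config/hive-site-template.xml")


def render_hive_site() -> None:
    # Mirror setup.sh: DATANUCLEUS_AUTO_CREATE_TABLES is true only in dev mode.
    auto_create = "true" if os.environ.get("USAGE_MODE", "").lower() == "dev" else "false"
    replacements = {
        "{{POSTGRES_URL}}": os.environ["POSTGRES_URL"],
        "{{POSTGRES_DB}}": os.environ["POSTGRES_DB"],
        "{{POSTGRES_USER}}": os.environ["POSTGRES_USER"],
        "{{POSTGRES_PASSWORD}}": os.environ["POSTGRES_PASSWORD"],
        "{{DATANUCLEUS_AUTO_CREATE_TABLES}}": auto_create,
    }
    content = TEMPLATE.read_text()
    for token, value in replacements.items():
        content = content.replace(token, value)
    (Path(os.environ["SPARK_HOME"]) / "conf" / "hive-site.xml").write_text(content)


if __name__ == "__main__":
    render_hive_site()
```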