From 128e2a3eb5c35ea447c88e477d2971b83f59bbba Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Fri, 7 Jun 2024 09:16:22 -0500
Subject: [PATCH] using postgres as hive metastore

---
 Dockerfile                     |  8 ++++
 build.gradle                   |  2 +
 config/hive-site-template.xml  | 47 +++++++++++++++++++++++
 docker-compose.yaml            | 68 ++++++++++++++++++++++++++--------
 scripts/entrypoint.sh          |  2 +
 scripts/notebook_entrypoint.sh | 29 +--------------
 scripts/setup.sh               | 34 +++++++++++++++++
 7 files changed, 148 insertions(+), 42 deletions(-)
 create mode 100644 config/hive-site-template.xml
 create mode 100644 scripts/setup.sh

diff --git a/Dockerfile b/Dockerfile
index f23775e..fcf9eba 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,6 +13,7 @@ ENV HADOOP_AWS_VER=3.3.4
 # NOTE: ensure Delta Spark jar version matches python pip delta-spark version specified in the Pipfile
 ENV DELTA_SPARK_VER=3.2.0
 ENV SCALA_VER=2.12
+ENV POSTGRES_JDBC_VER=42.2.23

 # Run Gradle task to download JARs to /gradle/gradle_jars location
 COPY build.gradle settings.gradle gradlew /gradle/
@@ -37,6 +38,13 @@ COPY ./src/notebook/startup.py /.ipython/profile_default/startup/
 COPY ./scripts/ /opt/scripts/
 RUN chmod a+x /opt/scripts/*.sh

+# Copy the configuration files
+COPY ./config/ /opt/config/
+
+# This is the shared directory between the spark master, worker and driver containers
+ENV CDM_SHARED_DIR=/cdm_shared_workspace
+RUN mkdir -p ${CDM_SHARED_DIR} && chmod -R 777 ${CDM_SHARED_DIR}
+
 # Switch back to the original user
 USER ${ORI_USER}

diff --git a/build.gradle b/build.gradle
index dae2d81..68c38af 100644
--- a/build.gradle
+++ b/build.gradle
@@ -16,10 +16,12 @@ repositories {
 def hadoopAwsVersion = System.getenv('HADOOP_AWS_VER')
 def deltaSparkVersion = System.getenv('DELTA_SPARK_VER')
 def scalaVersion = System.getenv('SCALA_VER')
+def postgresVersion = System.getenv('POSTGRES_JDBC_VER')

 dependencies {
     implementation "org.apache.hadoop:hadoop-aws:$hadoopAwsVersion"
     implementation "io.delta:delta-spark_${scalaVersion}:$deltaSparkVersion"
+    implementation "org.postgresql:postgresql:$postgresVersion"
 }

 task downloadDependencies(type: Copy) {

diff --git a/config/hive-site-template.xml b/config/hive-site-template.xml
new file mode 100644
index 0000000..75ddd77
--- /dev/null
+++ b/config/hive-site-template.xml
@@ -0,0 +1,47 @@
+<configuration>
+    <property>
+        <name>javax.jdo.option.ConnectionURL</name>
+        <value>jdbc:postgresql://{{POSTGRES_URL}}/{{POSTGRES_DB}}</value>
+    </property>
+
+    <property>
+        <name>javax.jdo.option.ConnectionDriverName</name>
+        <value>org.postgresql.Driver</value>
+    </property>
+
+    <property>
+        <name>javax.jdo.option.ConnectionUserName</name>
+        <value>{{POSTGRES_USER}}</value>
+    </property>
+
+    <property>
+        <name>javax.jdo.option.ConnectionPassword</name>
+        <value>{{POSTGRES_PASSWORD}}</value>
+    </property>
+
+    <property>
+        <name>datanucleus.schema.autoCreateTables</name>
+        <value>true</value>
+    </property>
+
+    <property>
+        <name>hive.metastore.schema.verification</name>
+        <value>false</value>
+    </property>
+
+    <property>
+        <name>hive.metastore.warehouse.dir</name>
+        <value>/cdm_shared_workspace/hive_metastore</value>
+    </property>
+
+    <property>
+        <name>hive.support.concurrency</name>
+        <value>true</value>
+    </property>
+</configuration>

diff --git a/docker-compose.yaml b/docker-compose.yaml
index e82408f..668645d 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -14,9 +14,18 @@ services:
     environment:
       - SPARK_MODE=master
       - SPARK_MASTER_WEBUI_PORT=8090
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace

   spark-worker-1:
-    image: bitnami/spark:3.5.1
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: spark-worker-1
     depends_on:
       - spark-master
@@ -28,9 +37,17 @@
       - SPARK_WORKER_CORES=2
       - SPARK_WORKER_MEMORY=1G
       - SPARK_WORKER_WEBUI_PORT=8081
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace

   spark-worker-2:
-    image: bitnami/spark:3.5.1
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: spark-worker-2
     depends_on:
       - spark-master
@@ -42,14 +59,12 @@
       - SPARK_WORKER_CORES=2
       - SPARK_WORKER_MEMORY=1G
       - SPARK_WORKER_WEBUI_PORT=8082
-
-  spark-test-node:
-    image: bitnami/spark:3.5.1
-    container_name: spark-test-node
-    depends_on:
-      - spark-master
-    environment:
-      - SPARK_MASTER_URL=spark://spark-master:7077
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace

   minio:
     image: minio/minio
@@ -78,10 +93,10 @@
     entrypoint: >
       bash -c "
       mc alias set minio http://minio:9002 minio minio123 &&
-      if ! mc ls minio/delta-lake 2>/dev/null; then
-        mc mb minio/delta-lake && echo 'Bucket delta-lake created'
+      if ! mc ls minio/cdm-lake 2>/dev/null; then
+        mc mb minio/cdm-lake && echo 'Bucket cdm-lake created'
       else
-        echo 'bucket delta-lake already exists'
+        echo 'bucket cdm-lake already exists'
       fi
       "

@@ -103,6 +118,29 @@
       - MINIO_ACCESS_KEY=minio
       - MINIO_SECRET_KEY=minio123
       - SPARK_MODE=notebook
-      - MAX_EXECUTORS=2
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
+
+  postgres:
+    image: postgres:16.3
+    restart: always
+    container_name: postgres
+    # To avoid incorrect user permissions, manually create the volume directory before running Docker.
+    # export UID=$(id -u)
+    # export GID=$(id -g)
+    # mkdir -p cdr/cdm/jupyter/cdm-postgres
+    # reference: https://forums.docker.com/t/systemd-coredump-taking-ownership-of-tmp-db-directory-and-contents-in-rails-app/93609
+    user: "${UID}:${GID}"
+    ports:
+      - "5432:5432"
+    environment:
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
     volumes:
-      - ./cdr/cdm/jupyter:/cdm_shared_workspace
\ No newline at end of file
+      - ./cdr/cdm/jupyter/cdm-postgres:/var/lib/postgresql/data
\ No newline at end of file

diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
index 35ab63c..c5bf6ee 100644
--- a/scripts/entrypoint.sh
+++ b/scripts/entrypoint.sh
@@ -1,5 +1,7 @@
 #!/bin/bash

+/opt/scripts/setup.sh
+
 if [ "$SPARK_MODE" = "notebook" ]; then
     exec /opt/scripts/notebook_entrypoint.sh "$@"
 else

diff --git a/scripts/notebook_entrypoint.sh b/scripts/notebook_entrypoint.sh
index 920ccb2..ad8b0b1 100644
--- a/scripts/notebook_entrypoint.sh
+++ b/scripts/notebook_entrypoint.sh
@@ -2,38 +2,13 @@

 echo "starting jupyter notebook"

-source /opt/bitnami/scripts/spark-env.sh
-if [ -z "$SPARK_CONF_FILE" ]; then
-    echo "Error: unable to find SPARK_CONF_FILE path"
-    exit 1
-fi
-
-# Set Spark configurations
-{
-  # Set dynamic allocation configurations to allow parallel job executions
-  if [ -z "$MAX_EXECUTORS" ]; then
-    # If MAX_EXECUTORS is not set, default to 5. Adjust as needed.
-    MAX_EXECUTORS=5
-  fi
-  echo "spark.dynamicAllocation.enabled true"
-  echo "spark.dynamicAllocation.minExecutors 1"
-  echo "spark.dynamicAllocation.maxExecutors $MAX_EXECUTORS"
-
-  # Set spark.driver.host if SPARK_DRIVER_HOST is set
-  if [ -n "$SPARK_DRIVER_HOST" ]; then
-    echo "spark.driver.host $SPARK_DRIVER_HOST"
-  fi
-} >> "$SPARK_CONF_FILE"
-
-WORKSPACE_DIR="/cdm_shared_workspace"
-mkdir -p "$WORKSPACE_DIR"
-cd "$WORKSPACE_DIR"
+cd "$CDM_SHARED_DIR"

 # Start Jupyter Lab
 jupyter lab --ip=0.0.0.0 \
     --port="$NOTEBOOK_PORT" \
     --no-browser \
     --allow-root \
-    --notebook-dir="$WORKSPACE_DIR" \
+    --notebook-dir="$CDM_SHARED_DIR" \
     --ServerApp.token='' \
     --ServerApp.password=''
\ No newline at end of file

diff --git a/scripts/setup.sh b/scripts/setup.sh
new file mode 100644
index 0000000..c4a57d3
--- /dev/null
+++ b/scripts/setup.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# This script sets up the Spark environment variables and configurations for Spark master, worker, and driver (Jupyter) nodes.
+
+# Load Spark environment variables
+source /opt/bitnami/scripts/spark-env.sh
+if [ -z "$SPARK_CONF_FILE" ]; then
+    echo "Error: unable to find SPARK_CONF_FILE path"
+    exit 1
+fi
+
+# Set Spark configurations
+{
+  # Set dynamic allocation configurations to allow parallel job executions
+  if [ -z "$MAX_EXECUTORS" ]; then
+    # If MAX_EXECUTORS is not set, default to 5. Adjust as needed.
+    MAX_EXECUTORS=5
+  fi
+  echo "spark.dynamicAllocation.enabled true"
+  echo "spark.dynamicAllocation.minExecutors 1"
+  echo "spark.dynamicAllocation.maxExecutors $MAX_EXECUTORS"
+
+  # Set spark.driver.host if SPARK_DRIVER_HOST is set
+  if [ -n "$SPARK_DRIVER_HOST" ]; then
+    echo "spark.driver.host $SPARK_DRIVER_HOST"
+  fi
+} >> "$SPARK_CONF_FILE"
+
+# Config hive-site.xml for Hive support
+sed -e "s|{{POSTGRES_URL}}|${POSTGRES_URL}|g" \
+    -e "s|{{POSTGRES_DB}}|${POSTGRES_DB}|g" \
+    -e "s|{{POSTGRES_USER}}|${POSTGRES_USER}|g" \
+    -e "s|{{POSTGRES_PASSWORD}}|${POSTGRES_PASSWORD}|g" \
+    /opt/config/hive-site-template.xml > "$SPARK_HOME"/conf/hive-site.xml
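
Reviewer note: a quick way to sanity-check the sed templating in scripts/setup.sh is to grep the rendered file inside a running container. This is only a sketch under the compose defaults above (POSTGRES_URL=postgres:5432, POSTGRES_DB=hive); the service name spark-master comes from docker-compose.yaml, and $SPARK_HOME is set by the bitnami Spark base image.

# Show the rendered JDBC URL inside the spark-master container.
docker compose exec spark-master bash -c \
    'grep -A1 "javax.jdo.option.ConnectionURL" "$SPARK_HOME/conf/hive-site.xml"'
# With the compose defaults the <value> should render to:
#   jdbc:postgresql://postgres:5432/hive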
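A second hedged check: because datanucleus.schema.autoCreateTables is true, Hive creates its metastore schema in Postgres on first use. The listing below uses the service, user, and database names from docker-compose.yaml; tables such as DBS and TBLS are standard Hive metastore tables, not something this patch defines.

# List tables in the hive database after running a first Spark SQL
# statement with Hive support enabled.
docker compose exec postgres psql -U hive -d hive -c '\dt'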