using postgres as hive metastore
Tianhao-Gu committed Jun 7, 2024
1 parent 25f67eb commit 128e2a3
Showing 7 changed files with 148 additions and 42 deletions.
8 changes: 8 additions & 0 deletions Dockerfile
@@ -13,6 +13,7 @@ ENV HADOOP_AWS_VER=3.3.4
# NOTE: ensure the Delta Spark jar version matches the delta-spark pip package version specified in the Pipfile
ENV DELTA_SPARK_VER=3.2.0
ENV SCALA_VER=2.12
ENV POSTGRES_JDBC_VER=42.2.23

# Run Gradle task to download JARs to /gradle/gradle_jars location
COPY build.gradle settings.gradle gradlew /gradle/
@@ -37,6 +38,13 @@ COPY ./src/notebook/startup.py /.ipython/profile_default/startup/
COPY ./scripts/ /opt/scripts/
RUN chmod a+x /opt/scripts/*.sh

# Copy the configuration files
COPY ./config/ /opt/config/

# This is the shared directory between the Spark master, worker, and driver containers
ENV CDM_SHARED_DIR=/cdm_shared_workspace
RUN mkdir -p ${CDM_SHARED_DIR} && chmod -R 777 ${CDM_SHARED_DIR}

# Switch back to the original user
USER ${ORI_USER}

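As a quick sanity check after building, the downloaded JDBC driver can be listed from the image (a sketch; the image tag here is arbitrary, and /gradle/gradle_jars is the download location named in the Gradle comment above):

# Build the image under a throwaway tag, then list the fetched jars.
docker build -t cdm-spark-notebook .
docker run --rm --entrypoint ls cdm-spark-notebook /gradle/gradle_jars | grep -i postgresql
# With POSTGRES_JDBC_VER=42.2.23, this should print postgresql-42.2.23.jar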
2 changes: 2 additions & 0 deletions build.gradle
@@ -16,10 +16,12 @@ repositories {
def hadoopAwsVersion = System.getenv('HADOOP_AWS_VER')
def deltaSparkVersion = System.getenv('DELTA_SPARK_VER')
def scalaVersion = System.getenv('SCALA_VER')
def postgresVersion = System.getenv('POSTGRES_JDBC_VER')

dependencies {
implementation "org.apache.hadoop:hadoop-aws:$hadoopAwsVersion"
implementation "io.delta:delta-spark_${scalaVersion}:$deltaSparkVersion"
implementation "org.postgresql:postgresql:$postgresVersion"
}

task downloadDependencies(type: Copy) {
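To reproduce the jar download outside Docker, the same version variables the Dockerfile exports can be set before invoking the task (a sketch; the values mirror the ENV lines in the Dockerfile above):

# Mirror the Dockerfile ENV values, then run the Gradle task that fetches the jars.
export HADOOP_AWS_VER=3.3.4 DELTA_SPARK_VER=3.2.0 SCALA_VER=2.12 POSTGRES_JDBC_VER=42.2.23
./gradlew downloadDependencies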
47 changes: 47 additions & 0 deletions config/hive-site-template.xml
@@ -0,0 +1,47 @@
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:postgresql://{{POSTGRES_URL}}/{{POSTGRES_DB}}</value>
</property>

<!-- JDBC driver class name for PostgreSQL -->
<!-- Ensure that PostgreSQL JDBC driver jars are included via Gradle -->
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.postgresql.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>{{POSTGRES_USER}}</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>{{POSTGRES_PASSWORD}}</value>
</property>

<!-- Configuration to automatically create the necessary tables in the database schema if they don't exist -->
<!-- The Hive metastore fails to start if this property is not set -->
<property>
<name>datanucleus.schema.autoCreateTables</name>
<value>true</value>
</property>

<!-- Disable schema verification in the Hive metastore -->
<!-- The Hive metastore fails to start if this property is not set -->
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>

<!-- Directory location for the Hive warehouse where table data is stored -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/cdm_shared_workspace/hive_metastore</value>
</property>

<!-- Enable support for concurrency in Hive, allowing multiple users to access and modify the data simultaneously -->
<property>
<name>hive.support.concurrency</name>
<value>true</value>
</property>
</configuration>
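Once scripts/setup.sh (below) renders this template into $SPARK_HOME/conf, any Spark session with Hive support enabled should register its tables in the Postgres-backed metastore rather than a local Derby database. A minimal smoke test, assuming the compose stack is up and that spark-sql is on the PATH in the Spark image:

# Create a table through the shared metastore, then list what it knows about.
docker compose exec spark-master spark-sql \
  -e "CREATE TABLE IF NOT EXISTS metastore_smoke (id INT); SHOW TABLES;"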
68 changes: 53 additions & 15 deletions docker-compose.yaml
@@ -14,9 +14,18 @@ services:
environment:
- SPARK_MODE=master
- SPARK_MASTER_WEBUI_PORT=8090
- MAX_EXECUTORS=4
- POSTGRES_USER=hive
- POSTGRES_PASSWORD=hivepassword
- POSTGRES_DB=hive
- POSTGRES_URL=postgres:5432
volumes:
- ./cdr/cdm/jupyter:/cdm_shared_workspace

spark-worker-1:
image: bitnami/spark:3.5.1
build:
context: .
dockerfile: Dockerfile
container_name: spark-worker-1
depends_on:
- spark-master
@@ -28,9 +37,17 @@
- SPARK_WORKER_CORES=2
- SPARK_WORKER_MEMORY=1G
- SPARK_WORKER_WEBUI_PORT=8081
- POSTGRES_USER=hive
- POSTGRES_PASSWORD=hivepassword
- POSTGRES_DB=hive
- POSTGRES_URL=postgres:5432
volumes:
- ./cdr/cdm/jupyter:/cdm_shared_workspace

spark-worker-2:
image: bitnami/spark:3.5.1
build:
context: .
dockerfile: Dockerfile
container_name: spark-worker-2
depends_on:
- spark-master
@@ -42,14 +59,12 @@
- SPARK_WORKER_CORES=2
- SPARK_WORKER_MEMORY=1G
- SPARK_WORKER_WEBUI_PORT=8082

spark-test-node:
image: bitnami/spark:3.5.1
container_name: spark-test-node
depends_on:
- spark-master
environment:
- SPARK_MASTER_URL=spark://spark-master:7077
- POSTGRES_USER=hive
- POSTGRES_PASSWORD=hivepassword
- POSTGRES_DB=hive
- POSTGRES_URL=postgres:5432
volumes:
- ./cdr/cdm/jupyter:/cdm_shared_workspace

minio:
image: minio/minio
@@ -78,10 +93,10 @@
entrypoint: >
bash -c "
mc alias set minio http://minio:9002 minio minio123 &&
if ! mc ls minio/delta-lake 2>/dev/null; then
mc mb minio/delta-lake && echo 'Bucket delta-lake created'
if ! mc ls minio/cdm-lake 2>/dev/null; then
mc mb minio/cdm-lake && echo 'Bucket cdm-lake created'
else
echo 'bucket delta-lake already exists'
echo 'bucket cdm-lake already exists'
fi
"
@@ -103,6 +118,29 @@
- MINIO_ACCESS_KEY=minio
- MINIO_SECRET_KEY=minio123
- SPARK_MODE=notebook
- MAX_EXECUTORS=2
- MAX_EXECUTORS=4
- POSTGRES_USER=hive
- POSTGRES_PASSWORD=hivepassword
- POSTGRES_DB=hive
- POSTGRES_URL=postgres:5432
volumes:
- ./cdr/cdm/jupyter:/cdm_shared_workspace

postgres:
image: postgres:16.3
restart: always
container_name: postgres
# To avoid incorrect user permissions, manually create the volume directory before running Docker.
# export UID=$(id -u)
# export GID=$(id -g)
# mkdir -p cdr/cdm/jupyter/cdm-postgres
# reference: https://forums.docker.com/t/systemd-coredump-taking-ownership-of-tmp-db-directory-and-contents-in-rails-app/93609
user: "${UID}:${GID}"
ports:
- "5432:5432"
environment:
- POSTGRES_USER=hive
- POSTGRES_PASSWORD=hivepassword
- POSTGRES_DB=hive
volumes:
- ./cdr/cdm/jupyter:/cdm_shared_workspace
- ./cdr/cdm/jupyter/cdm-postgres:/var/lib/postgresql/data
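After the first Hive-backed query runs, datanucleus.schema.autoCreateTables=true (set in hive-site-template.xml above) should have created the metastore schema inside the hive database. This can be checked directly against the postgres service, using the credentials declared above:

# List the tables DataNucleus created for the metastore.
docker compose exec postgres psql -U hive -d hive -c '\dt'
# Expect Hive metastore tables such as DBS and TBLS in the listing.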
2 changes: 2 additions & 0 deletions scripts/entrypoint.sh
@@ -1,5 +1,7 @@
#!/bin/bash

/opt/scripts/setup.sh

if [ "$SPARK_MODE" = "notebook" ]; then
exec /opt/scripts/notebook_entrypoint.sh "$@"
else
29 changes: 2 additions & 27 deletions scripts/notebook_entrypoint.sh
@@ -2,38 +2,13 @@

echo "starting jupyter notebook"

source /opt/bitnami/scripts/spark-env.sh
if [ -z "$SPARK_CONF_FILE" ]; then
echo "Error: unable to find SPARK_CONF_FILE path"
exit 1
fi

# Set Spark configurations
{
# Set dynamic allocation configurations to allow parallel job executions
if [ -z "$MAX_EXECUTORS" ]; then
# If MAX_EXECUTORS is not set, default to 5. Adjust as needed.
MAX_EXECUTORS=5
fi
echo "spark.dynamicAllocation.enabled true"
echo "spark.dynamicAllocation.minExecutors 1"
echo "spark.dynamicAllocation.maxExecutors $MAX_EXECUTORS"

# Set spark.driver.host if SPARK_DRIVER_HOST is set
if [ -n "$SPARK_DRIVER_HOST" ]; then
echo "spark.driver.host $SPARK_DRIVER_HOST"
fi
} >> "$SPARK_CONF_FILE"

WORKSPACE_DIR="/cdm_shared_workspace"
mkdir -p "$WORKSPACE_DIR"
cd "$WORKSPACE_DIR"
cd "$CDM_SHARED_DIR"

# Start Jupyter Lab
jupyter lab --ip=0.0.0.0 \
--port="$NOTEBOOK_PORT" \
--no-browser \
--allow-root \
--notebook-dir="$WORKSPACE_DIR" \
--notebook-dir="$CDM_SHARED_DIR" \
--ServerApp.token='' \
--ServerApp.password=''
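With the token and password disabled above, Jupyter Lab should respond without authentication once the container is up. A quick probe from the host (a sketch; whether NOTEBOOK_PORT is published to the same port on the host is not shown in this diff):

curl -sI "http://localhost:${NOTEBOOK_PORT}/lab" | head -n 1
# Expect: HTTP/1.1 200 OK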
34 changes: 34 additions & 0 deletions scripts/setup.sh
@@ -0,0 +1,34 @@
#!/bin/bash

# This script sets up the Spark environment variables and configurations for Spark master, worker, and driver (Jupyter) nodes.

# Load Spark environment variables
source /opt/bitnami/scripts/spark-env.sh
if [ -z "$SPARK_CONF_FILE" ]; then
echo "Error: unable to find SPARK_CONF_FILE path"
exit 1
fi

# Set Spark configurations
{
# Set dynamic allocation configurations to allow parallel job executions
if [ -z "$MAX_EXECUTORS" ]; then
# If MAX_EXECUTORS is not set, default to 5. Adjust as needed.
MAX_EXECUTORS=5
fi
echo "spark.dynamicAllocation.enabled true"
echo "spark.dynamicAllocation.minExecutors 1"
echo "spark.dynamicAllocation.maxExecutors $MAX_EXECUTORS"

# Set spark.driver.host if SPARK_DRIVER_HOST is set
if [ -n "$SPARK_DRIVER_HOST" ]; then
echo "spark.driver.host $SPARK_DRIVER_HOST"
fi
} >> "$SPARK_CONF_FILE"

# Render hive-site.xml from the template for Hive support
sed -e "s|{{POSTGRES_URL}}|${POSTGRES_URL}|g" \
-e "s|{{POSTGRES_DB}}|${POSTGRES_DB}|g" \
-e "s|{{POSTGRES_USER}}|${POSTGRES_USER}|g" \
-e "s|{{POSTGRES_PASSWORD}}|${POSTGRES_PASSWORD}|g" \
/opt/config/hive-site-template.xml > "$SPARK_HOME"/conf/hive-site.xml
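With the compose values above (POSTGRES_URL=postgres:5432, POSTGRES_DB=hive), the rendered ConnectionURL should read jdbc:postgresql://postgres:5432/hive. One way to confirm the substitution inside a running container:

# Print the rendered JDBC URL from the generated hive-site.xml.
docker compose exec spark-master bash -c 'grep -A1 ConnectionURL "$SPARK_HOME/conf/hive-site.xml"'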
