From 128e2a3eb5c35ea447c88e477d2971b83f59bbba Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Fri, 7 Jun 2024 09:16:22 -0500
Subject: [PATCH] using postgres as hive metastore

---
 Dockerfile                     |  8 ++++
 build.gradle                   |  2 +
 config/hive-site-template.xml  | 47 +++++++++++++++++++++++
 docker-compose.yaml            | 68 ++++++++++++++++++++++++++--------
 scripts/entrypoint.sh          |  2 +
 scripts/notebook_entrypoint.sh | 29 +--------------
 scripts/setup.sh               | 34 +++++++++++++++++
 7 files changed, 148 insertions(+), 42 deletions(-)
 create mode 100644 config/hive-site-template.xml
 create mode 100644 scripts/setup.sh

diff --git a/Dockerfile b/Dockerfile
index f23775e..fcf9eba 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,6 +13,7 @@ ENV HADOOP_AWS_VER=3.3.4
 # NOTE: ensure Delta Spark jar version matches python pip delta-spark version specified in the Pipfile
 ENV DELTA_SPARK_VER=3.2.0
 ENV SCALA_VER=2.12
+ENV POSTGRES_JDBC_VER=42.2.23

 # Run Gradle task to download JARs to /gradle/gradle_jars location
 COPY build.gradle settings.gradle gradlew /gradle/
@@ -37,6 +38,13 @@ COPY ./src/notebook/startup.py /.ipython/profile_default/startup/
 COPY ./scripts/ /opt/scripts/
 RUN chmod a+x /opt/scripts/*.sh

+# Copy the configuration files
+COPY ./config/ /opt/config/
+
+# This is the shared directory between the spark master, worker and driver containers
+ENV CDM_SHARED_DIR=/cdm_shared_workspace
+RUN mkdir -p ${CDM_SHARED_DIR} && chmod -R 777 ${CDM_SHARED_DIR}
+
 # Switch back to the original user
 USER ${ORI_USER}

diff --git a/build.gradle b/build.gradle
index dae2d81..68c38af 100644
--- a/build.gradle
+++ b/build.gradle
@@ -16,10 +16,12 @@ repositories {
 def hadoopAwsVersion = System.getenv('HADOOP_AWS_VER')
 def deltaSparkVersion = System.getenv('DELTA_SPARK_VER')
 def scalaVersion = System.getenv('SCALA_VER')
+def postgresVersion = System.getenv('POSTGRES_JDBC_VER')

 dependencies {
     implementation "org.apache.hadoop:hadoop-aws:$hadoopAwsVersion"
     implementation "io.delta:delta-spark_${scalaVersion}:$deltaSparkVersion"
+    implementation "org.postgresql:postgresql:$postgresVersion"
 }

 task downloadDependencies(type: Copy) {

diff --git a/config/hive-site-template.xml b/config/hive-site-template.xml
new file mode 100644
index 0000000..75ddd77
--- /dev/null
+++ b/config/hive-site-template.xml
@@ -0,0 +1,47 @@
+<configuration>
+    <property>
+        <name>javax.jdo.option.ConnectionURL</name>
+        <value>jdbc:postgresql://{{POSTGRES_URL}}/{{POSTGRES_DB}}</value>
+    </property>
+
+    <property>
+        <name>javax.jdo.option.ConnectionDriverName</name>
+        <value>org.postgresql.Driver</value>
+    </property>
+
+    <property>
+        <name>javax.jdo.option.ConnectionUserName</name>
+        <value>{{POSTGRES_USER}}</value>
+    </property>
+
+    <property>
+        <name>javax.jdo.option.ConnectionPassword</name>
+        <value>{{POSTGRES_PASSWORD}}</value>
+    </property>
+
+    <property>
+        <name>datanucleus.schema.autoCreateTables</name>
+        <value>true</value>
+    </property>
+
+    <property>
+        <name>hive.metastore.schema.verification</name>
+        <value>false</value>
+    </property>
+
+    <property>
+        <name>hive.metastore.warehouse.dir</name>
+        <value>/cdm_shared_workspace/hive_metastore</value>
+    </property>
+
+    <property>
+        <name>hive.support.concurrency</name>
+        <value>true</value>
+    </property>
+</configuration>

diff --git a/docker-compose.yaml b/docker-compose.yaml
index e82408f..668645d 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -14,9 +14,18 @@ services:
     environment:
       - SPARK_MODE=master
       - SPARK_MASTER_WEBUI_PORT=8090
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace

   spark-worker-1:
-    image: bitnami/spark:3.5.1
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: spark-worker-1
     depends_on:
       - spark-master
@@ -28,9 +37,17 @@
       - SPARK_WORKER_CORES=2
       - SPARK_WORKER_MEMORY=1G
       - SPARK_WORKER_WEBUI_PORT=8081
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace

   spark-worker-2:
-    image: bitnami/spark:3.5.1
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: spark-worker-2
     depends_on:
       - spark-master
@@ -42,14 +59,12 @@
       - SPARK_WORKER_CORES=2
       - SPARK_WORKER_MEMORY=1G
       - SPARK_WORKER_WEBUI_PORT=8082
-
-  spark-test-node:
-    image: bitnami/spark:3.5.1
-    container_name: spark-test-node
-    depends_on:
-      - spark-master
-    environment:
-      - SPARK_MASTER_URL=spark://spark-master:7077
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace

   minio:
     image: minio/minio
@@ -78,10 +93,10 @@
     entrypoint: >
       bash -c "
       mc alias set minio http://minio:9002 minio minio123 &&
-      if ! mc ls minio/delta-lake 2>/dev/null; then
-        mc mb minio/delta-lake && echo 'Bucket delta-lake created'
+      if ! mc ls minio/cdm-lake 2>/dev/null; then
+        mc mb minio/cdm-lake && echo 'Bucket cdm-lake created'
       else
-        echo 'bucket delta-lake already exists'
+        echo 'bucket cdm-lake already exists'
       fi
       "

@@ -103,6 +118,29 @@
       - MINIO_ACCESS_KEY=minio
       - MINIO_SECRET_KEY=minio123
       - SPARK_MODE=notebook
-      - MAX_EXECUTORS=2
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
+
+  postgres:
+    image: postgres:16.3
+    restart: always
+    container_name: postgres
+    # To avoid incorrect user permissions, manually create the volume directory before running Docker.
+    # export UID=$(id -u)
+    # export GID=$(id -g)
+    # mkdir -p cdr/cdm/jupyter/cdm-postgres
+    # reference: https://forums.docker.com/t/systemd-coredump-taking-ownership-of-tmp-db-directory-and-contents-in-rails-app/93609
+    user: "${UID}:${GID}"
+    ports:
+      - "5432:5432"
+    environment:
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
     volumes:
-      - ./cdr/cdm/jupyter:/cdm_shared_workspace
\ No newline at end of file
+      - ./cdr/cdm/jupyter/cdm-postgres:/var/lib/postgresql/data
\ No newline at end of file

diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
index 35ab63c..c5bf6ee 100644
--- a/scripts/entrypoint.sh
+++ b/scripts/entrypoint.sh
@@ -1,5 +1,7 @@
 #!/bin/bash

+/opt/scripts/setup.sh
+
 if [ "$SPARK_MODE" = "notebook" ]; then
     exec /opt/scripts/notebook_entrypoint.sh "$@"
 else

diff --git a/scripts/notebook_entrypoint.sh b/scripts/notebook_entrypoint.sh
index 920ccb2..ad8b0b1 100644
--- a/scripts/notebook_entrypoint.sh
+++ b/scripts/notebook_entrypoint.sh
@@ -2,38 +2,13 @@

 echo "starting jupyter notebook"

-source /opt/bitnami/scripts/spark-env.sh
-if [ -z "$SPARK_CONF_FILE" ]; then
-    echo "Error: unable to find SPARK_CONF_FILE path"
-    exit 1
-fi
-
-# Set Spark configurations
-{
-  # Set dynamic allocation configurations to allow parallel job executions
-  if [ -z "$MAX_EXECUTORS" ]; then
-    # If MAX_EXECUTORS is not set, default to 5. Adjust as needed.
-    MAX_EXECUTORS=5
-  fi
-  echo "spark.dynamicAllocation.enabled true"
-  echo "spark.dynamicAllocation.minExecutors 1"
-  echo "spark.dynamicAllocation.maxExecutors $MAX_EXECUTORS"
-
-  # Set spark.driver.host if SPARK_DRIVER_HOST is set
-  if [ -n "$SPARK_DRIVER_HOST" ]; then
-    echo "spark.driver.host $SPARK_DRIVER_HOST"
-  fi
-} >> "$SPARK_CONF_FILE"
-
-WORKSPACE_DIR="/cdm_shared_workspace"
-mkdir -p "$WORKSPACE_DIR"
-cd "$WORKSPACE_DIR"
+cd "$CDM_SHARED_DIR"

 # Start Jupyter Lab
 jupyter lab --ip=0.0.0.0 \
     --port="$NOTEBOOK_PORT" \
     --no-browser \
     --allow-root \
-    --notebook-dir="$WORKSPACE_DIR" \
+    --notebook-dir="$CDM_SHARED_DIR" \
     --ServerApp.token='' \
     --ServerApp.password=''
\ No newline at end of file

diff --git a/scripts/setup.sh b/scripts/setup.sh
new file mode 100644
index 0000000..c4a57d3
--- /dev/null
+++ b/scripts/setup.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# This script sets up the Spark environment variables and configurations for Spark master, worker, and driver (Jupyter) nodes.
+
+# Load Spark environment variables
+source /opt/bitnami/scripts/spark-env.sh
+if [ -z "$SPARK_CONF_FILE" ]; then
+    echo "Error: unable to find SPARK_CONF_FILE path"
+    exit 1
+fi
+
+# Set Spark configurations
+{
+  # Set dynamic allocation configurations to allow parallel job executions
+  if [ -z "$MAX_EXECUTORS" ]; then
+    # If MAX_EXECUTORS is not set, default to 5. Adjust as needed.
+    MAX_EXECUTORS=5
+  fi
+  echo "spark.dynamicAllocation.enabled true"
+  echo "spark.dynamicAllocation.minExecutors 1"
+  echo "spark.dynamicAllocation.maxExecutors $MAX_EXECUTORS"
+
+  # Set spark.driver.host if SPARK_DRIVER_HOST is set
+  if [ -n "$SPARK_DRIVER_HOST" ]; then
+    echo "spark.driver.host $SPARK_DRIVER_HOST"
+  fi
+} >> "$SPARK_CONF_FILE"
+
+# Config hive-site.xml for Hive support
+sed -e "s|{{POSTGRES_URL}}|${POSTGRES_URL}|g" \
+    -e "s|{{POSTGRES_DB}}|${POSTGRES_DB}|g" \
+    -e "s|{{POSTGRES_USER}}|${POSTGRES_USER}|g" \
+    -e "s|{{POSTGRES_PASSWORD}}|${POSTGRES_PASSWORD}|g" \
+    /opt/config/hive-site-template.xml > "$SPARK_HOME"/conf/hive-site.xml
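
Reviewer note: a quick way to sanity-check the sed templating in scripts/setup.sh is to grep the rendered file inside a running container. This is only a sketch under the compose defaults above (POSTGRES_URL=postgres:5432, POSTGRES_DB=hive); the service name spark-master comes from docker-compose.yaml, and $SPARK_HOME is set by the bitnami Spark base image.

# Show the rendered JDBC URL inside the spark-master container.
docker compose exec spark-master bash -c \
    'grep -A1 "javax.jdo.option.ConnectionURL" "$SPARK_HOME/conf/hive-site.xml"'
# With the compose defaults the <value> should render to:
#   jdbc:postgresql://postgres:5432/hive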
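A second hedged check: because datanucleus.schema.autoCreateTables is true, Hive creates its metastore schema in Postgres on first use. The listing below uses the service, user, and database names from docker-compose.yaml; tables such as DBS and TBLS are standard Hive metastore tables, not something this patch defines.

# List tables in the hive database after running a first Spark SQL
# statement with Hive support enabled.
docker compose exec postgres psql -U hive -d hive -c '\dt'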