using postgres as hive metastore #27

Merged: 3 commits, Jun 8, 2024
Changes from 2 commits
8 changes: 8 additions & 0 deletions Dockerfile
@@ -13,6 +13,7 @@ ENV HADOOP_AWS_VER=3.3.4
# NOTE: ensure Delta Spark jar version matches python pip delta-spark version specified in the Pipfile
ENV DELTA_SPARK_VER=3.2.0
ENV SCALA_VER=2.12
ENV POSTGRES_JDBC_VER=42.2.23

# Run Gradle task to download JARs to /gradle/gradle_jars location
COPY build.gradle settings.gradle gradlew /gradle/
@@ -37,6 +38,13 @@ COPY ./src/notebook/startup.py /.ipython/profile_default/startup/
COPY ./scripts/ /opt/scripts/
RUN chmod a+x /opt/scripts/*.sh

# Copy the configuration files
COPY ./config/ /opt/config/

# This is the shared directory between the spark master, worker and driver containers
ENV CDM_SHARED_DIR=/cdm_shared_workspace
RUN mkdir -p ${CDM_SHARED_DIR} && chmod -R 777 ${CDM_SHARED_DIR}

# Switch back to the original user
USER ${ORI_USER}

2 changes: 2 additions & 0 deletions build.gradle
@@ -16,10 +16,12 @@ repositories {
def hadoopAwsVersion = System.getenv('HADOOP_AWS_VER')
def deltaSparkVersion = System.getenv('DELTA_SPARK_VER')
def scalaVersion = System.getenv('SCALA_VER')
def postgresVersion = System.getenv('POSTGRES_JDBC_VER')

dependencies {
    implementation "org.apache.hadoop:hadoop-aws:$hadoopAwsVersion"
    implementation "io.delta:delta-spark_${scalaVersion}:$deltaSparkVersion"
    implementation "org.postgresql:postgresql:$postgresVersion"
}

task downloadDependencies(type: Copy) {
47 changes: 47 additions & 0 deletions config/hive-site-template.xml
@@ -0,0 +1,47 @@
<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:postgresql://{{POSTGRES_URL}}/{{POSTGRES_DB}}</value>
  </property>

  <!-- JDBC driver class name for PostgreSQL -->
  <!-- Ensure that PostgreSQL JDBC driver jars are included via Gradle -->
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>org.postgresql.Driver</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>{{POSTGRES_USER}}</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>{{POSTGRES_PASSWORD}}</value>
  </property>

  <!-- Automatically create the necessary tables in the database schema if they don't exist -->
  <!-- The Hive metastore may fail to start if this property is unset and the tables have not been created previously -->
  <property>
    <name>datanucleus.schema.autoCreateTables</name>
    <value>{{DATANUCLEUS_AUTO_CREATE_TABLES}}</value>
  </property>

  <!-- Disable schema verification in the Hive metastore -->
  <!-- The Hive metastore fails to start if this property is not set -->
  <property>
    <name>hive.metastore.schema.verification</name>
    <value>false</value>
  </property>

  <!-- Directory location for the Hive warehouse where table data is stored -->
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/cdm_shared_workspace/hive_metastore</value>
  </property>

  <!-- Enable support for concurrency in Hive, allowing multiple users to access and modify the data simultaneously -->
  <property>
    <name>hive.support.concurrency</name>
    <value>true</value>
  </property>
</configuration>
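Once setup.sh renders this template into $SPARK_HOME/conf/hive-site.xml, a Spark session only needs Hive support enabled to use the Postgres-backed metastore. A minimal smoke-test sketch, assuming the compose stack below is running (the app name and table are illustrative, not part of this PR):

```python
from pyspark.sql import SparkSession

# Build a session against the compose stack's master; enableHiveSupport()
# makes Spark read $SPARK_HOME/conf/hive-site.xml and talk to the
# Postgres-backed metastore configured above.
spark = (
    SparkSession.builder
    .appName("metastore-smoke-test")        # illustrative name
    .master("spark://spark-master:7077")
    .enableHiveSupport()
    .getOrCreate()
)

# First metastore access; with datanucleus.schema.autoCreateTables=true
# this also creates the backing tables in Postgres.
spark.sql("CREATE TABLE IF NOT EXISTS smoke_test (id INT) USING parquet")
spark.sql("SHOW TABLES").show()
```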
101 changes: 83 additions & 18 deletions docker-compose.yaml
@@ -14,9 +14,18 @@ services:
     environment:
       - SPARK_MODE=master
       - SPARK_MASTER_WEBUI_PORT=8090
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
 
   spark-worker-1:
-    image: bitnami/spark:3.5.1
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: spark-worker-1
     depends_on:
       - spark-master
@@ -28,9 +37,17 @@
       - SPARK_WORKER_CORES=2
       - SPARK_WORKER_MEMORY=1G
       - SPARK_WORKER_WEBUI_PORT=8081
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
 
   spark-worker-2:
-    image: bitnami/spark:3.5.1
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: spark-worker-2
     depends_on:
       - spark-master
@@ -42,14 +59,12 @@
       - SPARK_WORKER_CORES=2
       - SPARK_WORKER_MEMORY=1G
       - SPARK_WORKER_WEBUI_PORT=8082
-
-  spark-test-node:
-    image: bitnami/spark:3.5.1
-    container_name: spark-test-node
-    depends_on:
-      - spark-master
-    environment:
-      - SPARK_MASTER_URL=spark://spark-master:7077
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
     volumes:
       - ./cdr/cdm/jupyter:/cdm_shared_workspace
 
   minio:
     image: minio/minio
@@ -78,18 +93,18 @@ services:
     entrypoint: >
       bash -c "
       mc alias set minio http://minio:9002 minio minio123 &&
-      if ! mc ls minio/delta-lake 2>/dev/null; then
-        mc mb minio/delta-lake && echo 'Bucket delta-lake created'
+      if ! mc ls minio/cdm-lake 2>/dev/null; then
+        mc mb minio/cdm-lake && echo 'Bucket cdm-lake created'
       else
-        echo 'bucket delta-lake already exists'
+        echo 'bucket cdm-lake already exists'
       fi
       "
 
-  notebook:
+  dev_notebook:
     build:
       context: .
       dockerfile: Dockerfile
-    container_name: spark-notebook
+    container_name: spark-dev-notebook
     ports:
       - "4041:4041"
     depends_on:
@@ -98,11 +113,61 @@
     environment:
       - NOTEBOOK_PORT=4041
       - SPARK_MASTER_URL=spark://spark-master:7077
-      - SPARK_DRIVER_HOST=spark-notebook
+      - SPARK_DRIVER_HOST=spark-dev-notebook
       - MINIO_URL=http://minio:9002
       - MINIO_ACCESS_KEY=minio
       - MINIO_SECRET_KEY=minio123
       - SPARK_MODE=notebook
-      - MAX_EXECUTORS=2
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+      - USAGE_MODE=dev # Dev mode grants full MinIO access and extra privileges, e.g. letting Hive create metastore tables (see scripts/setup.sh).
     volumes:
       - ./cdr/cdm/jupyter:/cdm_shared_workspace
+
+  user_notebook:
+    build:
+      context: .
+      dockerfile: Dockerfile
+    container_name: spark-user-notebook
+    ports:
+      - "4042:4042"
+    depends_on:
+      - spark-master
+      - minio-create-bucket
+    environment:
+      - NOTEBOOK_PORT=4042
+      - SPARK_MASTER_URL=spark://spark-master:7077
+      - SPARK_DRIVER_HOST=spark-user-notebook
+      - MINIO_URL=http://minio:9002
+      - MINIO_ACCESS_KEY=minio # TODO: create a MinIO user and policy for read-only access
+      - MINIO_SECRET_KEY=minio123
+      - SPARK_MODE=notebook
+      - MAX_EXECUTORS=4
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+      - POSTGRES_URL=postgres:5432
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
+
+  postgres:
+    image: postgres:16.3
+    restart: always
+    container_name: postgres
+    # To avoid incorrect user permissions, manually create the volume directory before running Docker:
+    #   export UID=$(id -u)
+    #   export GID=$(id -g)
+    #   mkdir -p cdr/cdm/jupyter/cdm-postgres
+    # Reference: https://forums.docker.com/t/systemd-coredump-taking-ownership-of-tmp-db-directory-and-contents-in-rails-app/93609
+    user: "${UID}:${GID}"
+    ports:
+      - "5432:5432"
+    environment:
+      - POSTGRES_USER=hive
+      - POSTGRES_PASSWORD=hivepassword
+      - POSTGRES_DB=hive
+    volumes:
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
+      - ./cdr/cdm/jupyter/cdm-postgres:/var/lib/postgresql/data # Local development only; in Rancher, PostgreSQL data shouldn't be stored in a shared mount.
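To confirm the metastore schema actually lands in this Postgres instance, one can inspect the hive database directly. A hedged sketch using psycopg2 (not among this PR's dependencies; host, port, and credentials mirror the service definition above):

```python
import psycopg2  # assumed available; not installed by this PR

# Connect to the metastore DB that the postgres service publishes on
# localhost:5432.
conn = psycopg2.connect(
    host="localhost",
    port=5432,
    dbname="hive",
    user="hive",
    password="hivepassword",
)
with conn, conn.cursor() as cur:
    cur.execute(
        "SELECT table_name FROM information_schema.tables "
        "WHERE table_schema = 'public' ORDER BY table_name"
    )
    # After the first Hive query with autoCreateTables=true, DataNucleus
    # tables such as DBS and TBLS should be listed here.
    for (name,) in cur.fetchall():
        print(name)
conn.close()
```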
2 changes: 2 additions & 0 deletions scripts/entrypoint.sh
@@ -1,5 +1,7 @@
#!/bin/bash

/opt/scripts/setup.sh

if [ "$SPARK_MODE" = "notebook" ]; then
exec /opt/scripts/notebook_entrypoint.sh "$@"
else
29 changes: 2 additions & 27 deletions scripts/notebook_entrypoint.sh
@@ -2,38 +2,13 @@
 
 echo "starting jupyter notebook"
 
-source /opt/bitnami/scripts/spark-env.sh
-if [ -z "$SPARK_CONF_FILE" ]; then
-    echo "Error: unable to find SPARK_CONF_FILE path"
-    exit 1
-fi
-
-# Set Spark configurations
-{
-  # Set dynamic allocation configurations to allow parallel job executions
-  if [ -z "$MAX_EXECUTORS" ]; then
-    # If MAX_EXECUTORS is not set, default to 5. Adjust as needed.
-    MAX_EXECUTORS=5
-  fi
-  echo "spark.dynamicAllocation.enabled true"
-  echo "spark.dynamicAllocation.minExecutors 1"
-  echo "spark.dynamicAllocation.maxExecutors $MAX_EXECUTORS"
-
-  # Set spark.driver.host if SPARK_DRIVER_HOST is set
-  if [ -n "$SPARK_DRIVER_HOST" ]; then
-    echo "spark.driver.host $SPARK_DRIVER_HOST"
-  fi
-} >> "$SPARK_CONF_FILE"
-
-WORKSPACE_DIR="/cdm_shared_workspace"
-mkdir -p "$WORKSPACE_DIR"
-cd "$WORKSPACE_DIR"
+cd "$CDM_SHARED_DIR"
 
 # Start Jupyter Lab
 jupyter lab --ip=0.0.0.0 \
     --port="$NOTEBOOK_PORT" \
     --no-browser \
     --allow-root \
-    --notebook-dir="$WORKSPACE_DIR" \
+    --notebook-dir="$CDM_SHARED_DIR" \
     --ServerApp.token='' \
     --ServerApp.password=''
57 changes: 57 additions & 0 deletions scripts/setup.sh
Review comment (Member): this is getting big enough we might want to consider porting it to python

Reply (Collaborator, author): 👍
@@ -0,0 +1,57 @@
#!/bin/bash

# This script sets up the Spark environment variables and configurations for Spark master, worker, and driver (Jupyter) nodes.

# Load Spark environment variables
source /opt/bitnami/scripts/spark-env.sh
if [ -z "$SPARK_CONF_FILE" ]; then
    echo "Error: unable to find SPARK_CONF_FILE path"
    exit 1
fi

# Set Spark configurations
{
  # Set dynamic allocation configurations to allow parallel job executions
  if [ -z "$MAX_EXECUTORS" ]; then
    # If MAX_EXECUTORS is not set, default to 5. Adjust as needed.
    MAX_EXECUTORS=5
  fi
  echo "spark.dynamicAllocation.enabled true"
  echo "spark.dynamicAllocation.minExecutors 1"
  echo "spark.dynamicAllocation.maxExecutors $MAX_EXECUTORS"

  # Set spark.driver.host if SPARK_DRIVER_HOST is set
  if [ -n "$SPARK_DRIVER_HOST" ]; then
    echo "spark.driver.host $SPARK_DRIVER_HOST"
  fi
} >> "$SPARK_CONF_FILE"

# Configure hive-site.xml for Hive support
sed -e "s|{{POSTGRES_URL}}|${POSTGRES_URL}|g" \
    -e "s|{{POSTGRES_DB}}|${POSTGRES_DB}|g" \
    -e "s|{{POSTGRES_USER}}|${POSTGRES_USER}|g" \
    -e "s|{{POSTGRES_PASSWORD}}|${POSTGRES_PASSWORD}|g" \
    /opt/config/hive-site-template.xml > "$SPARK_HOME"/conf/hive-site.xml


update_config() {
  sed -i "s|{{DATANUCLEUS_AUTO_CREATE_TABLES}}|${DATANUCLEUS_AUTO_CREATE_TABLES}|g" "$SPARK_HOME"/conf/hive-site.xml
}

# Apply settings based on server usage mode
set_environment() {
  local lowercase_usage_mode=${USAGE_MODE,,}  # Convert to lowercase

  case "$lowercase_usage_mode" in
    dev)
      export DATANUCLEUS_AUTO_CREATE_TABLES=true
      ;;
    *)
      export DATANUCLEUS_AUTO_CREATE_TABLES=false
      ;;
  esac
  update_config
  echo "Environment settings applied for $USAGE_MODE."
}

set_environment
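Following up on the review suggestion to port this script to Python, the hive-site templating could translate roughly as below. A sketch only, not part of this PR; the paths and environment variables mirror the shell version, and the function name is hypothetical:

```python
#!/usr/bin/env python3
"""Hypothetical Python port of setup.sh's hive-site templating."""
import os
from pathlib import Path

TEMPLATE = Path("/opt/config/hive-site-template.xml")


def render_hive_site() -> None:
    # Mirror setup.sh: DATANUCLEUS_AUTO_CREATE_TABLES is true only in dev mode.
    auto_create = "true" if os.environ.get("USAGE_MODE", "").lower() == "dev" else "false"
    replacements = {
        "{{POSTGRES_URL}}": os.environ["POSTGRES_URL"],
        "{{POSTGRES_DB}}": os.environ["POSTGRES_DB"],
        "{{POSTGRES_USER}}": os.environ["POSTGRES_USER"],
        "{{POSTGRES_PASSWORD}}": os.environ["POSTGRES_PASSWORD"],
        "{{DATANUCLEUS_AUTO_CREATE_TABLES}}": auto_create,
    }
    content = TEMPLATE.read_text()
    for token, value in replacements.items():
        content = content.replace(token, value)
    (Path(os.environ["SPARK_HOME"]) / "conf" / "hive-site.xml").write_text(content)


if __name__ == "__main__":
    render_hive_site()
```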