Prepare Ecommerce codebase for Spark (Iceberg) Support (#54)
Showing 29 changed files with 595 additions and 94 deletions.
New Dockerfile for a Spark image bundling the Iceberg and AWS jars:

@@ -0,0 +1,34 @@
FROM openjdk:11-jre-slim

# Set environment variables
ENV SPARK_VERSION=3.5.1
ENV HADOOP_VERSION=3.3.4
ENV ICEBERG_VERSION=1.4.2
ENV AWS_SDK_VERSION=1.12.581

# Install necessary tools
RUN apt-get update && apt-get install -y curl wget procps rsync ssh

# Download and install Spark
RUN wget https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    tar -xvzf spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    mv spark-${SPARK_VERSION}-bin-hadoop3 /spark && \
    rm spark-${SPARK_VERSION}-bin-hadoop3.tgz

# Set Spark environment variables
ENV SPARK_HOME=/spark
ENV PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

# Download necessary JARs
RUN mkdir -p /spark/jars && \
    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_2.12-${ICEBERG_VERSION}.jar -O /spark/jars/iceberg-spark-runtime.jar && \
    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -O /spark/jars/iceberg-aws-bundle.jar && \
    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -O /spark/jars/hadoop-aws.jar && \
    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -O /spark/jars/aws-java-sdk-bundle.jar

# Create directory for Spark events
RUN mkdir -p /tmp/spark-events

WORKDIR /spark

CMD ["bash"]
New shell script to build the image and push it to Docker Hub:

@@ -0,0 +1,20 @@
#!/bin/bash

# Set variables
DOCKER_HUB_ORG="snowplow"
IMAGE_NAME="spark-s3-iceberg"
TAG="latest"

# Build the image
echo "Building Docker image..."
docker build --platform linux/amd64 -t $DOCKER_HUB_ORG/$IMAGE_NAME:$TAG .

# Log in to Docker Hub
echo "Logging in to Docker Hub..."
docker login

# Push the image to Docker Hub
echo "Pushing image to Docker Hub..."
docker push $DOCKER_HUB_ORG/$IMAGE_NAME:$TAG

echo "Image successfully built and pushed to Docker Hub"
New Docker Compose file standing up a Spark master, a worker, and a Thrift server:

@@ -0,0 +1,66 @@
version: '3'

networks:
  spark-network:
    driver: bridge

services:
  spark-master:
    image: snowplow/spark-s3-iceberg:latest
    command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
    hostname: spark-master
    ports:
      - '8080:8080'
      - '7077:7077'
    environment:
      - SPARK_LOCAL_IP=spark-master
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=7077
      - SPARK_MASTER_OPTS="-Dspark.driver.memory=2g"
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network

  spark-worker:
    image: snowplow/spark-s3-iceberg:latest
    command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
    depends_on:
      - spark-master
    environment:
      - SPARK_WORKER_CORES=2
      - SPARK_WORKER_MEMORY=4G
      - SPARK_EXECUTOR_MEMORY=3G
      - SPARK_LOCAL_IP=spark-worker
      - SPARK_MASTER=spark://spark-master:7077
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network

  thrift-server:
    image: snowplow/spark-s3-iceberg:latest
    command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
    ports:
      - '10000:10000'
    depends_on:
      - spark-master
      - spark-worker
    environment:
      - SPARK_LOCAL_IP=thrift-server
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network
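
A hedged sketch of driving the stack: service names and ports are taken from the compose file above, beeline ships in the Spark distribution's bin directory, and AWS credentials are assumed to be set in the shell so Compose can interpolate them:

# Export credentials, start the cluster, and connect to the Thrift server over JDBC
export AWS_ACCESS_KEY_ID=...          # placeholder
export AWS_SECRET_ACCESS_KEY=...      # placeholder
docker compose up -d                  # master UI on :8080, Thrift server on :10000
docker compose exec thrift-server /spark/bin/beeline -u jdbc:hive2://localhost:10000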
New spark-defaults.conf (mounted into each service above), wiring up S3A access and the Glue-backed Iceberg catalog:

@@ -0,0 +1,44 @@
spark.master                                     spark://spark-master:7077

spark.sql.warehouse.dir                          s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue                           org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.glue.catalog-impl              org.apache.iceberg.aws.glue.GlueCatalog
spark.sql.catalog.glue.warehouse                 s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.io-impl                   org.apache.iceberg.aws.s3.S3FileIO
spark.sql.defaultCatalog                         glue
spark.sql.catalog.glue.database                  dbt-spark-iceberg

spark.hadoop.fs.s3a.impl                         org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.access.key                   <AWS_ACCESS_KEY_ID>
spark.hadoop.fs.s3a.secret.key                   <AWS_SECRET_ACCESS_KEY>
spark.hadoop.fs.s3a.endpoint                     s3.eu-west-1.amazonaws.com
spark.hadoop.fs.s3a.path.style.access            true
spark.hadoop.fs.s3a.region                       eu-west-1
spark.hadoop.fs.s3a.aws.region                   eu-west-1

# Enable AWS SDK V4 signing (required for regions launched after January 2014)
spark.hadoop.com.amazonaws.services.s3.enableV4  true
spark.hadoop.fs.s3a.aws.credentials.provider     org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider

# Hive Metastore configuration (using AWS Glue)
spark.hadoop.hive.metastore.client.factory.class com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory

# Thrift Server configuration for better performance in concurrent environments
spark.sql.hive.thriftServer.singleSession        false
spark.sql.hive.thriftServer.async                true
# spark.sql.hive.thriftServer.maxWorkerThreads   100
# spark.sql.hive.thriftServer.minWorkerThreads   50
# spark.sql.hive.thriftServer.workerQueue.size   2000

# Memory and performance tuning
# spark.driver.memory                            2g
# spark.executor.memory                          3g
# spark.worker.memory                            4g
spark.network.timeout                            600s
spark.sql.broadcastTimeout                       600s
spark.sql.adaptive.enabled                       true
spark.serializer                                 org.apache.spark.serializer.KryoSerializer

# Logging and debugging
spark.eventLog.enabled                           true
spark.eventLog.dir                               /tmp/spark-events
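
With the stack up, the Glue catalog wiring can be smoke-tested from inside the master container. A sketch: spark-sql is part of the Spark distribution, the catalog name glue comes from this file, and the mounted properties file is the one defined above:

# List Glue namespaces through the Iceberg catalog (requires valid AWS credentials)
docker compose exec spark-master /spark/bin/spark-sql \
  --properties-file /spark/conf/spark-defaults.conf \
  -e 'SHOW NAMESPACES IN glue'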