Skip to content

Commit

Permalink
add delta spark packages to support delta lake
Browse files Browse the repository at this point in the history
  • Loading branch information
Tianhao-Gu committed May 22, 2024
1 parent bcc0e28 commit 12dda6e
Show file tree
Hide file tree
Showing 7 changed files with 283 additions and 32 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.DS_Store
.idea
.coverage
*_pycache__
*_pycache__
cdm_shared_workspace/
22 changes: 17 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,23 @@ USER root

# Install OS packages with apt and remove the apt package lists in the same layer.
# curl is used by the jar download steps further down in this file.
RUN apt-get update && apt-get install -y \
# GCC required to resolve error during JupyterLab installation: psutil could not be installed from sources because gcc is not installed.
gcc \
gcc curl \
&& rm -rf /var/lib/apt/lists/*

# Install jars to support delta lake spark operations
# The hadoop-aws jar is fetched from Maven Central at the version pinned in HADOOP_AWS_VER
# and written directly into the Spark jars directory.
ENV HADOOP_AWS_VER=3.3.4
# -f: fail the build on an HTTP error instead of saving the error page as the jar
# -sS: no progress meter, but still print errors; -L: follow redirects
RUN curl -fsSL \
    -o /opt/bitnami/spark/jars/hadoop-aws-${HADOOP_AWS_VER}.jar \
    https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_AWS_VER}/hadoop-aws-${HADOOP_AWS_VER}.jar

# NOTE: ensure the Delta Spark jar version matches the python pip delta-spark version specified in the Pipfile
ENV DELTA_SPARK_VER=3.2.0
ENV SCALA_VER=2.12
# Fetch delta-spark (built for SCALA_VER) straight into the Spark jars directory.
# -f: fail the build on an HTTP error instead of saving the error page as the jar
# -sS: no progress meter, but still print errors; -L: follow redirects
RUN curl -fsSL \
    -o /opt/bitnami/spark/jars/delta-spark_${SCALA_VER}-${DELTA_SPARK_VER}.jar \
    https://repo1.maven.org/maven2/io/delta/delta-spark_${SCALA_VER}/${DELTA_SPARK_VER}/delta-spark_${SCALA_VER}-${DELTA_SPARK_VER}.jar

# Fetch delta-storage at the same version as delta-spark (DELTA_SPARK_VER) straight into
# the Spark jars directory.
# -f: fail the build on an HTTP error instead of saving the error page as the jar
# -sS: no progress meter, but still print errors; -L: follow redirects
RUN curl -fsSL \
    -o /opt/bitnami/spark/jars/delta-storage-${DELTA_SPARK_VER}.jar \
    https://repo1.maven.org/maven2/io/delta/delta-storage/${DELTA_SPARK_VER}/delta-storage-${DELTA_SPARK_VER}.jar

# install pipenv
# --no-cache-dir keeps pip's download cache out of the image layer
RUN pip3 install --no-cache-dir pipenv

Expand All @@ -19,10 +33,8 @@ RUN pipenv sync --system
# Copy the project sources to /src and append that directory to PYTHONPATH
COPY ./src/ /src
# key=value form; the space-separated "ENV key value" form is legacy syntax
ENV PYTHONPATH="${PYTHONPATH}:/src"

# Copy the entrypoint script into /opt and make it executable for all users
COPY scripts/entrypoint.sh /opt/
RUN chmod a+x /opt/entrypoint.sh
# Copy the scripts directory into /opt/scripts and make every top-level entry in it executable
COPY ./scripts/ /opt/scripts/
RUN chmod a+x /opt/scripts/*

# Switch back to the original user
# NOTE(review): ORI_USER is not defined in the visible lines — presumably set earlier in this Dockerfile; confirm.
USER ${ORI_USER}

# Exec-form entrypoint: runs /opt/entrypoint.sh directly, without a wrapping shell
ENTRYPOINT ["/opt/entrypoint.sh"]
3 changes: 3 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ name = "pypi"
jupyterlab= "==4.2.0"
pyspark= "==3.5.1"
boto3 = "==1.34.109"
minio = "==7.2.7"
delta-spark = "==3.2.0" # should match JAR version (DELTA_SPARK_VER) specified in the Dockerfile
pandas = "==2.2.2"

[dev-packages]
pytest = "==8.2.1"
Expand Down
188 changes: 175 additions & 13 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 12dda6e

Please sign in to comment.