Skip to content

Commit

Permalink
Testing apt remove step.
Browse files — browse the repository at this point in the history
This is to shrink down the final image (DEVOPS-2015).
  • Branch information
jsfillman authored Aug 24, 2024
1 parent 80f34d4 commit 357ccc5
Showing 1 changed file with 36 additions and 40 deletions.
76 changes: 36 additions & 40 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,63 +1,51 @@
# Base image for building dependencies
FROM bitnami/spark:3.5.1 AS build-stage
FROM bitnami/spark:3.5.1

# Switch to root to install packages
# https://github.com/bitnami/containers/tree/main/bitnami/spark#installing-additional-jars
USER root

# Create a non-root user
# User 1001 is not defined in /etc/passwd in the bitnami/spark image, causing various issues.
# References:
# https://github.com/bitnami/containers/issues/52698
# https://github.com/bitnami/containers/pull/52661
RUN groupadd -r spark && useradd -r -g spark spark_user

# Install necessary build tools
# Install necessary build tools and dependencies
RUN apt-get update && apt-get install -y \
gcc curl git graphviz graphviz-dev python3-pip \
gcc curl git graphviz graphviz-dev \
&& rm -rf /var/lib/apt/lists/*

# Set up environment variables
ENV HADOOP_AWS_VER=3.3.4
# NOTE: ensure Delta Spark jar version matches python pip delta-spark version specified in the Pipfile
ENV DELTA_SPARK_VER=3.2.0
ENV SCALA_VER=2.12
ENV POSTGRES_JDBC_VER=42.2.23

# Copy build files and run Gradle to download JARs
# Run Gradle task to download JARs to /gradle/gradle_jars location
COPY build.gradle settings.gradle gradlew /gradle/
COPY gradle /gradle/gradle
ENV GRADLE_JARS_DIR=gradle_jars
RUN /gradle/gradlew -p /gradle build
RUN cp -r /gradle/${GRADLE_JARS_DIR}/* /opt/bitnami/spark/jars/

# Make an empty yarn conf dir to prevent spark from complaining
RUN mkdir -p /opt/yarn/conf && chown -R spark_user:spark /opt/yarn
ENV YARN_CONF_DIR=/opt/yarn/conf

# Install pipenv and Python dependencies
RUN pip3 install pipenv
COPY Pipfile* ./
RUN pipenv sync --system

# Copy necessary files for the final stage
COPY ./src/ /src
COPY ./scripts/ /opt/scripts/
COPY ./config/ /opt/config/

# Create the runtime stage, based on the same image but without the build tools
FROM bitnami/spark:3.5.1 AS runtime-stage

# Switch to root to copy files and set permissions
USER root

# Create the same non-root user as in the build stage
RUN groupadd -r spark && useradd -r -g spark spark_user

# Copy pre-built JARs from the build stage
COPY --from=build-stage /gradle/gradle_jars/* /opt/bitnami/spark/jars/

# Copy Python environment and installed packages
COPY --from=build-stage /usr/local/lib/python3.*/dist-packages /usr/local/lib/python3.*/dist-packages
COPY --from=build-stage /usr/local/bin/pipenv /usr/local/bin/pipenv
# Remove unnecessary build tools and clean up
RUN apt-get remove -y \
gcc graphviz-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*

# Copy application code, scripts, and config files
COPY --from=build-stage /src /src
COPY --from=build-stage /opt/scripts /opt/scripts
COPY --from=build-stage /opt/config /opt/config

# Make an empty yarn conf dir to prevent spark from complaining
RUN mkdir -p /opt/yarn/conf && chown -R spark_user:spark /opt/yarn
ENV YARN_CONF_DIR=/opt/yarn/conf
# Ensure correct ownership for non-root user
RUN chown -R spark_user:spark /opt/bitnami

# Set up Jupyter directories
ENV JUPYTER_CONFIG_DIR=/.jupyter
Expand All
@@ -66,21 +54,29 @@
ENV JUPYTER_DATA_DIR=/.jupyter/data
RUN mkdir -p ${JUPYTER_CONFIG_DIR} ${JUPYTER_RUNTIME_DIR} ${JUPYTER_DATA_DIR}
RUN chown -R spark_user:spark /.jupyter

# Set PYTHONPATH and other environment variables
# Copy application code and scripts
COPY ./src/ /src
ENV PYTHONPATH "${PYTHONPATH}:/src"

# Copy the startup script to the default profile location to automatically load pre-built functions in Jupyter Notebook
COPY --from=build-stage /src/notebook_utils/startup.py /.ipython/profile_default/startup/
COPY ./src/notebook_utils/startup.py /.ipython/profile_default/startup/
RUN chown -R spark_user:spark /.ipython

# Set up the shared directory between Spark components
# Copy and set up scripts
COPY ./scripts/ /opt/scripts/
RUN chmod a+x /opt/scripts/*.sh

# Copy the configuration files
COPY ./config/ /opt/config/

# Ensure correct ownership for all files
RUN chown -R spark_user:spark /src /opt/scripts /opt/config

# Set up shared directory between Spark components
ENV CDM_SHARED_DIR=/cdm_shared_workspace
RUN mkdir -p ${CDM_SHARED_DIR} && chmod -R 777 ${CDM_SHARED_DIR}
RUN chown -R spark_user:spark $CDM_SHARED_DIR

# Set correct permissions for non-root user
RUN chown -R spark_user:spark /opt/bitnami /src /opt/scripts /opt/config

# Switch back to non-root user
USER spark_user

Expand Down

0 comments on commit 357ccc5

Please sign in to comment.