#
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# A Dockerfile for building a container dedicated to running inside a Kubernetes (k8s) cluster.
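# A typical build looks like the following (image name/tag and overridden build args are illustrative):
#   docker build -t spark-rapids-k8s:22.12.0 \
#     --build-arg SPARK_VER=3.3.1 --build-arg RAPIDS_VER=22.12.0 .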
ARG CUDA_VER=11.5.0
ARG UBUNTU_VER=20.04
FROM nvidia/cuda:${CUDA_VER}-runtime-ubuntu${UBUNTU_VER}
# All build args with default values
ARG SPARK_UID=185
ARG SPARK_VER=3.3.1
ARG HADOOP_VER=3
ARG RAPIDS_VER=22.12.0
ARG HADOOP_AWS_VER=3.3.1
ARG AWS_JAVA_SDK_BUNDLE_VER=1.11.655
ARG GCS_CONNECTOR_VER=2.2.2
# DeltaLake requires strict version compatibility with SPARK_VER, see: https://docs.delta.io/latest/releases.html#compatibility-with-apache-spark
ARG DELTA_CORE_VER=2.2.0
ENV DEBIAN_FRONTEND=noninteractive
# https://forums.developer.nvidia.com/t/notice-cuda-linux-repository-key-rotation/212771
RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
# Install java dependencies
RUN apt-get update && apt-get install -y --no-install-recommends openjdk-8-jdk openjdk-8-jre gpg gpg-agent wget
ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
ENV PATH $PATH:/usr/lib/jvm/java-1.8.0-openjdk-amd64/jre/bin:/usr/lib/jvm/java-1.8.0-openjdk-amd64/bin
# Install Spark with desired versions
WORKDIR /opt
RUN set -ex && \
wget -O spark-KEYS "https://downloads.apache.org/spark/KEYS" && \
gpg --import spark-KEYS && \
wget https://archive.apache.org/dist/spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop$HADOOP_VER.tgz.asc && \
wget -O spark-$SPARK_VER-bin-hadoop$HADOOP_VER.tgz "https://www.apache.org/dyn/mirrors/mirrors.cgi?action=download&filename=spark/spark-$SPARK_VER/spark-$SPARK_VER-bin-hadoop$HADOOP_VER.tgz" && \
gpg --verify spark-$SPARK_VER-bin-hadoop$HADOOP_VER.tgz.asc spark-$SPARK_VER-bin-hadoop$HADOOP_VER.tgz && \
tar zxfv spark-$SPARK_VER-bin-hadoop$HADOOP_VER.tgz && \
ln -s spark-$SPARK_VER-bin-hadoop$HADOOP_VER spark
ENV PATH=/opt/spark/bin:$PATH
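# The py4j zip name below must match the version bundled with the Spark release installed above
# (py4j 0.10.9.5 ships with Spark 3.3.1); adjust it if SPARK_VER is overridden.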
ENV PYTHONPATH /opt/spark/python/lib/pyspark.zip:/opt/spark/python/lib/py4j-0.10.9.5-src.zip:$PYTHONPATH
# Install rapids plugin jar
WORKDIR /opt/spark/jars
RUN set -ex && \
wget -O rapids-4-spark-KEYS https://keys.openpgp.org/vks/v1/by-fingerprint/7A8A39909B9B202410C2A26F1D9E1285654392EF && \
gpg --import rapids-4-spark-KEYS && \
wget https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/$RAPIDS_VER/rapids-4-spark_2.12-$RAPIDS_VER-cuda11.jar.asc && \
wget https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/$RAPIDS_VER/rapids-4-spark_2.12-$RAPIDS_VER-cuda11.jar && \
gpg --verify rapids-4-spark_2.12-$RAPIDS_VER-cuda11.jar.asc rapids-4-spark_2.12-$RAPIDS_VER-cuda11.jar
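# The plugin is enabled per job at submit time, e.g. (illustrative):
#   --conf spark.plugins=com.nvidia.spark.SQLPlugin \
#   --conf spark.rapids.sql.enabled=true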
# Add necessary jars to support AWS s3 storage access
RUN set -ex && \
wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/$HADOOP_AWS_VER/hadoop-aws-$HADOOP_AWS_VER.jar && \
wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/$AWS_JAVA_SDK_BUNDLE_VER/aws-java-sdk-bundle-$AWS_JAVA_SDK_BUNDLE_VER.jar
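# These jars enable the s3a:// scheme; credentials and the filesystem implementation are
# supplied at runtime, e.g. (illustrative):
#   --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem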
# Add necessary jars to support GCS storage access
RUN set -ex && \
wget https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-$GCS_CONNECTOR_VER/gcs-connector-hadoop3-$GCS_CONNECTOR_VER-shaded.jar
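# The shaded connector enables the gs:// scheme, typically configured at runtime, e.g. (illustrative):
#   --conf spark.hadoop.fs.gs.impl=com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem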
# TODO: Add necessary jars to support Azure storage access
# TODO: Investigate running both pip and pip3 via virtualenvs
RUN apt-get update && \
apt-get install -y --no-install-recommends python3 python3-pip && \
pip install --upgrade pip setuptools && \
# Additional Python packages can be installed with pip3 if needed.
# Remove the pip and apt caches to keep the image small.
rm -r /root/.cache && rm -rf /var/cache/apt/*
ENV SPARK_HOME /opt/spark
# Add DeltaLake jar
RUN set -ex && \
wget https://repo1.maven.org/maven2/io/delta/delta-core_2.12/$DELTA_CORE_VER/delta-core_2.12-$DELTA_CORE_VER.jar && \
wget https://repo1.maven.org/maven2/io/delta/delta-storage/$DELTA_CORE_VER/delta-storage-$DELTA_CORE_VER.jar
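# Delta Lake is enabled per session at submit time, e.g. (illustrative):
#   --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \
#   --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog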
RUN mkdir -p /opt/spark/work-dir
WORKDIR /opt/spark/work-dir
RUN chmod g+w /opt/spark/work-dir
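# tini provides a minimal init process; the Spark entrypoint script uses it to
# forward signals and reap child processes.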
ENV TINI_VERSION v0.18.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /usr/bin/tini
RUN chmod +rx /usr/bin/tini
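# Use bash as /bin/sh, restrict su via pam_wheel, and make /etc/passwd group-writable
# so the Spark entrypoint can add an entry when running under an arbitrary non-root UID.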
RUN rm /bin/sh && \
ln -sv /bin/bash /bin/sh && \
echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
chgrp root /etc/passwd && chmod ug+rw /etc/passwd
# Always launch pyspark with python3; Spark 3.x would fail with syntax errors
# under python2.
ENV PYSPARK_PYTHON=/usr/bin/python3
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
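# Use the entrypoint script shipped with the Spark distribution; it dispatches
# between driver and executor roles when launched by Spark on Kubernetes.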
ENTRYPOINT [ "/opt/spark/kubernetes/dockerfiles/spark/entrypoint.sh" ]
# Specify the User that the actual main process will run as
USER ${SPARK_UID}