-
Notifications
You must be signed in to change notification settings - Fork 61
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add instructions to build a docker for GraphStorm-wholegraph on AWS #475
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
#!/bin/bash
#
# Build a local GraphStorm-WholeGraph docker image.
#
# Arguments:
#   $1  graphstorm home folder that the graphstorm codes are cloned to (required)
#   $2  docker image name (optional, default: graphstorm-wholegraph)
#   $3  image tag         (optional, default: local)
#
# The script stages the python/, inference_scripts/, tools/ and
# training_scripts/ folders under "$GSF_HOME/docker/code", builds the image
# from "$GSF_HOME/docker/wholegraph/Dockerfile", and always removes the staging
# folder afterwards (also when the build fails).

# Abort on the first failing command, on unset variables and on pipeline
# errors, so a failed copy never results in an image built from partial code.
set -euo pipefail

# process argument 1: graphstorm home folder
if [[ -z "${1:-}" ]]; then
    echo "Please provide the graphstorm home folder that the graphstorm codes are cloned to." >&2
    echo "For example, ./build_docker_oss4local.sh /graph-storm/" >&2
    exit 1
fi
GSF_HOME="$1"

# process argument 2: docker image name, default is graphstorm-wholegraph
IMAGE_NAME="${2:-graphstorm-wholegraph}"

# process argument 3: image's tag name, default is local
TAG="${3:-local}"

DOCKER_DIR="${GSF_HOME}/docker"
CODE_DIR="${DOCKER_DIR}/code"

# remove the temporary code folder on every exit path
cleanup() {
    rm -rf -- "${CODE_DIR}"
}
trap cleanup EXIT

# Copy scripts and tools codes to the docker folder.
# Start from an empty staging folder: `cp -r src dst` nests src inside dst
# when dst already exists (e.g. left over from an interrupted run).
rm -rf -- "${CODE_DIR}"
mkdir -p -- "${CODE_DIR}"
for subdir in python inference_scripts tools training_scripts; do
    cp -r -- "${GSF_HOME}/${subdir}" "${CODE_DIR}/${subdir}"
done

# Build OSS docker for EC2 instances that can pull ECR docker images
DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}"

echo "Build a local docker image ${DOCKER_FULLNAME}"
# The Dockerfile COPYs paths such as code/python/graphstorm and
# wholegraph/install-wholegraph.sh, i.e. paths relative to the docker folder,
# so that folder is passed as the build context instead of relying on the
# caller's current working directory.
docker build --no-cache \
    -f "${DOCKER_DIR}/wholegraph/Dockerfile" \
    -t "${DOCKER_FULLNAME}" \
    "${DOCKER_DIR}"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
# Docker file for building a docker image for running GraphStorm code with WholeGraph on AWS.
# NOTE(review): the original header said "on Amazon SageMaker", but the base image
# below is the "-ec2" PyTorch training image -- confirm the intended target.
# Note: Distributed graph partition will use another docker image which will come soon.

# Declared before FROM; no instruction visible in this file references it.
ARG DEVICE=gpu

FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.0.1-gpu-py310-cu118-ubuntu20.04-ec2
ENV dev_type=GPU
# Install DGL GPU version
# NOTE(review): this is the cu117 DGL wheel while the base image is cu118 -- confirm
# the mismatch is intended.
RUN pip3 install dgl==1.0.4+cu117 -f https://data.dgl.ai/wheels/cu117/repo.html && rm -rf /root/.cache

LABEL maintainer="Amazon AI Graph ML team"

# Install related Python packages
RUN pip3 install ogb==1.3.6 scipy pyarrow boto3 scikit-learn transformers

# Install other dependencies.
# `apt-get update` runs in the same RUN as the installs so that a cached
# "update" layer can never be combined with a newer package list request.
RUN apt-get update \
    && apt-get install -y apt-transport-https cython3 libicu-dev unzip
RUN pip3 install h5py psutil

# Install GraphStorm from source code
RUN mkdir -p /graphstorm
COPY code/python/graphstorm /graphstorm/python/graphstorm
ENV PYTHONPATH="/graphstorm/python/:${PYTHONPATH}"

# Copy GraphStorm scripts and tools
COPY code/inference_scripts /graphstorm/inference_scripts
COPY code/tools /graphstorm/tools
COPY code/training_scripts /graphstorm/training_scripts

# Print the CUDA installations found in the base image to the build log
# (the build fails here if no /usr/local/cuda* path exists).
RUN ls /usr/local/cuda*

#################################################
## Install NCCL
# Version string of the libnccl2 / libnccl-dev packages to download.
ARG NCCL_DEB_VERSION=2.15.1-1+cuda11.8
ARG NCCL_DEB_BASE_URL=https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64
# Download, install and delete the .deb files in one layer so that the
# archives do not remain in the final image. `-y` keeps the reinstall
# non-interactive.
RUN apt-get clean \
    && apt-get reinstall -y ca-certificates \
    && wget ${NCCL_DEB_BASE_URL}/libnccl2_${NCCL_DEB_VERSION}_amd64.deb \
    && dpkg -i libnccl2_${NCCL_DEB_VERSION}_amd64.deb \
    && wget ${NCCL_DEB_BASE_URL}/libnccl-dev_${NCCL_DEB_VERSION}_amd64.deb \
    && dpkg -i libnccl-dev_${NCCL_DEB_VERSION}_amd64.deb \
    && rm -f libnccl2_${NCCL_DEB_VERSION}_amd64.deb libnccl-dev_${NCCL_DEB_VERSION}_amd64.deb
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can also follow step 6 from the EC2 doc to install NCCL. |
||
|
||
#################################################
## Install EFA installer
# NOTE(review): "latest" makes the build non-reproducible; consider pinning a version.
ARG EFA_INSTALLER_VERSION=latest
RUN cd $HOME \
    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \
    && cd aws-efa-installer \
    && apt-get update \
    && apt-get install -y libhwloc-dev \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && rm -rf /var/lib/apt/lists/*

#################################################
## Install the CUDA 11.8 toolkit from the NVIDIA apt repository
# Register the repository signing key and the apt pin file; the downloaded key
# file is removed once it has been imported.
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \
    && apt-key add 3bf863cc.pub \
    && rm -f 3bf863cc.pub \
    && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin \
    && mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
# `apt-get` is used instead of `apt`, whose command-line interface is meant for
# interactive use. The repository is added over https, matching the URLs the
# key and pin file are fetched from.
RUN apt-get --fix-broken -y install \
    && apt-get update \
    && apt-get install -y software-properties-common \
    && add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" \
    && apt-get update \
    && apt-get install -y cuda-11-8 \
    && apt-get install -y cuda-libraries-dev-11-8

###################################################
## Install AWS-OFI-NCCL plugin
ARG AWS_OFI_NCCL_VERSION=v1.7.1-aws
# Build the plugin against the libfabric shipped by the EFA installer and the
# CUDA 11.8 toolkit installed above.
RUN git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \
    && cd /opt/aws-ofi-nccl \
    && git checkout ${AWS_OFI_NCCL_VERSION} \
    && ./autogen.sh \
    && ./configure --prefix=/opt/aws-ofi-nccl/ \
       --with-libfabric=/opt/amazon/efa/ \
       --with-cuda=/usr/local/cuda-11.8/ \
    && make && make install

# Put the EFA binaries on PATH.
ENV PATH="/opt/amazon/efa/bin:$PATH"
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We need to install the NCCL tests (Step 8) to verify the EFA+NCCL setup. |
||
# Install WholeGraph
# Refresh the package lists in the same RUN as the install so this step does
# not depend on lists left behind by an earlier layer.
RUN apt-get update \
    && apt-get install -y ninja-build \
    && apt-get remove -y cmake
# Install CMake 3.27.6 from the Kitware self-extracting installer; `yes`
# answers the installer's interactive prompts.
# NOTE(review): run this way the installer unpacks under /opt and nothing in
# this file adds that location to PATH, so the `cmake` found later is
# presumably the pip-installed 3.27.5 below -- confirm which one is intended.
RUN wget https://github.com/Kitware/CMake/releases/download/v3.27.6/cmake-3.27.6-linux-x86_64.sh \
    && mv cmake-3.27.6-linux-x86_64.sh /opt/cmake-3.27.6-linux-x86_64.sh \
    && cd /opt \
    && chmod +x cmake-3.27.6-linux-x86_64.sh \
    && yes | bash cmake-3.27.6-linux-x86_64.sh
RUN pip3 install cmake==3.27.5

# Build and install WholeGraph with the helper script from the build context.
COPY wholegraph/install-wholegraph.sh install-wg.sh
RUN bash install-wg.sh

# Increase nofile limit
RUN echo "root soft nofile 1048576" >> /etc/security/limits.conf \
    && echo "root hard nofile 1048576" >> /etc/security/limits.conf

# Set up SSH
# /run/sshd is sshd's privilege separation directory; creating it here is
# harmless if the package already provides it.
RUN apt-get update \
    && apt-get install -y openssh-client openssh-server \
    && mkdir -p /run/sshd
ENV SSH_PORT=2222
# Replace the first commented-out "#Port 22" line with the custom port.
RUN cat /etc/ssh/sshd_config > /tmp/sshd_config && \
    sed "0,/^#Port 22/s//Port ${SSH_PORT}/" /tmp/sshd_config > /etc/ssh/sshd_config
# NOTE(review): in an ENV instruction $HOME is only substituted if the base
# image declares HOME via ENV; otherwise this resolves to "/.ssh" -- confirm.
ENV SSHDIR=$HOME/.ssh
# Generate a passphrase-less key pair and authorize it for login.
RUN mkdir -p ${SSHDIR} \
    && ssh-keygen -t rsa -f ${SSHDIR}/id_rsa -N '' \
    && cp ${SSHDIR}/id_rsa.pub ${SSHDIR}/authorized_keys
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We need to modify |
||
# Expose the SSH port configured through SSH_PORT (2222) instead of repeating
# the literal, so the exposed port cannot drift from the sshd configuration.
EXPOSE ${SSH_PORT}
# Run sshd in the foreground as the container's main process.
CMD ["/usr/sbin/sshd", "-D"]
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/bin/bash
#
# Build and install WholeGraph (libwholegraph and pylibwholegraph) from source,
# together with the fmt and spdlog libraries that are built first.
# Sources are cloned under /opt.

# Stop at the first failing step so a broken clone or build is not silently
# followed by later steps.
set -euo pipefail

# Log which cmake is picked up from PATH and its version.
command -v cmake
cmake --version

# fmt 9.1.0, built as position independent code
git clone https://github.com/fmtlib/fmt.git /opt/fmt
cd /opt/fmt
git checkout 9.1.0
mkdir build && cd build
cmake -DCMAKE_POSITION_INDEPENDENT_CODE=TRUE ..
make
make install

# spdlog: build the static library and copy it to /usr/lib.
# NOTE(review): no tag is checked out, so this builds the default branch head.
git clone https://github.com/gabime/spdlog.git /opt/spdlog
cd /opt/spdlog && mkdir build && cd build
# Bound the parallelism to the CPU count; a bare `make -j` spawns an unlimited
# number of jobs.
cmake .. && make -j"$(nproc)"
cp libspdlog.a /usr/lib/libspdlog.a
export PYTHON=/usr/bin/python

# WholeGraph sources ("refactoring" branch)
mkdir -p /opt/rapids/
cd /opt/rapids/
git clone https://github.com/rapidsai/wholegraph.git -b refactoring
cd /opt/rapids/wholegraph/
pip install --upgrade pip
pip install scikit-build
export WHOLEGRAPH_CMAKE_CUDA_ARCHITECTURES="70-real;80-real;90"

# fix a bug in CMakeList.txt when build pylibwholegraph
cmake_lists=/opt/rapids/wholegraph/python/pylibwholegraph/CMakeLists.txt
old="import sysconfig; print(sysconfig.get_config_var('BINLIBDEST'))"
string="import sysconfig; print(\"%s/%s\" % (sysconfig.get_config_var(\"LIBDIR\"), sysconfig.get_config_var(\"INSTSONAME\")))"
# sed succeeds even when nothing matches, so report when the text to patch is
# no longer present upstream.
if ! grep -qF -- "$old" "$cmake_lists"; then
    echo "WARNING: expected text to patch was not found in ${cmake_lists}" >&2
fi
sed -i "s|$old|$string|" "$cmake_lists"

# PYTHON=python3 overrides the exported value above for this command only.
PYTHON=python3 bash build.sh libwholegraph pylibwholegraph -v
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We need to fix python installation. By default docker is using conda.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure if it has to do with python installation but I had to add: