Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Docker] Build docker for Parmetis Partition #784

Merged
merged 30 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions docker/build_docker_parmetis.sh
thvasilo marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash
#
# Build a local Docker image for running ParMETIS partitioning with GraphStorm.
#
# Usage:
#   ./build_docker_parmetis.sh <graphstorm-root> [image-name] [tag]
#
# Arguments:
#   $1  path to the root directory of the GraphStorm repository (required)
#   $2  docker image name, default "graphstorm"
#   $3  docker image tag, default "parmetis-cpu"
#
# The script stages the GraphStorm sources under <graphstorm-root>/docker/code,
# logs in to the public ECR registry to pull the base image, builds
# docker/parmetis/Dockerfile.parmetis and removes the staging folder again.
set -eox pipefail

# process argument 1: graphstorm home folder
if [ -z "$1" ]; then
    echo "Please provide a path to the root directory of the GraphStorm repository."
    echo "For example, ./build_docker_parmetis.sh ../ graphstorm parmetis-cpu"
    exit 1
else
    GSF_HOME="$1"
fi

# process argument 2: docker image name, default is graphstorm
IMAGE_NAME="${2:-graphstorm}"

# process argument 3: image's tag name, default is 'parmetis-cpu'
TAG="${3:-parmetis-cpu}"

# The Dockerfile COPYs from "code/...", so the build context has to be the
# docker folder that holds the staged sources, not the caller's working dir.
DOCKER_DIR="${GSF_HOME}/docker"
CODE_DIR="${DOCKER_DIR}/code"

# Remove the temporary code folder on every exit path, including a failed
# build (set -e would otherwise skip a trailing rm).
cleanup() {
    rm -rf -- "${CODE_DIR:?}"
}
trap cleanup EXIT

# Copy scripts and tools codes to the docker folder.
# Start from an empty staging folder: "cp -r src dst" nests src inside dst
# when dst already exists from an earlier interrupted run.
rm -rf -- "${CODE_DIR:?}"
mkdir -p -- "${CODE_DIR}"
for component in python examples inference_scripts tools training_scripts; do
    cp -r -- "${GSF_HOME}/${component}" "${CODE_DIR}/${component}"
done

# Authenticate against public ECR so the base image can be pulled
aws ecr-public get-login-password --region us-east-1 | \
    docker login --username AWS --password-stdin public.ecr.aws

# Build OSS docker for EC2 instances that can pull ECR docker images
DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}"

echo "Build a local docker image ${DOCKER_FULLNAME} for ParMETIS"

SOURCE_IMAGE="public.ecr.aws/ubuntu/ubuntu:20.04_stable"

DOCKER_BUILDKIT=1 docker build \
    --build-arg SOURCE="${SOURCE_IMAGE}" \
    -f "${DOCKER_DIR}/parmetis/Dockerfile.parmetis" \
    -t "${DOCKER_FULLNAME}" \
    "${DOCKER_DIR}"
111 changes: 111 additions & 0 deletions docker/parmetis/Dockerfile.parmetis
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Base image to build on; must be supplied with --build-arg SOURCE=<image>.
ARG SOURCE

# ---------------------------------------------------------------------------
# Stage "base": OS packages, a Python 3.9 virtual environment and the pinned
# Python dependencies (GraphStorm deps, CPU torch, DGL, OGB, transformers).
# ---------------------------------------------------------------------------
FROM ${SOURCE} AS base

# Keep apt from prompting during the build
ENV DEBIAN_FRONTEND=noninteractive
ENV HOME=/root

# Versions of the main Python packages; override with --build-arg if needed
ARG DGL_VERSION=1.1.3
ARG OGB_VERSION=1.3.6
ARG TORCH_VERSION=2.1.2
ARG TRANSFORMERS_VERSION=4.28.1

# System packages: SSH client/server, Python 3.9 and the compiler toolchain
# (gfortran, cmake, build-essential, g++) used to build native code later on.
# apt-get is used instead of apt, whose CLI is not meant for scripted use.
# The package lists are removed in the same layer to keep it small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        libicu-dev \
        openssh-client \
        openssh-server \
        python3.9 \
        python3.9-distutils \
        python3.9-venv \
        gfortran \
        cmake \
        build-essential \
        g++ \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Create and activate a Python venv
# (putting its bin directory first on PATH makes "pip"/"python" resolve to it)
RUN python3.9 -m venv /opt/gs-venv
ENV PATH="/opt/gs-venv/bin:$PATH"

# Install GraphStorm dependencies
RUN pip install \
        boto3==1.34.89 \
        botocore==1.34.89 \
        h5py==3.11.0 \
        networkx==3.1 \
        psutil==5.9.8 \
        pyarrow==14.0.0 \
        pydantic==2.7.0 \
        scikit-learn==1.4.2 \
        scipy==1.13.0 \
        pyyaml \
    && rm -rf /root/.cache

# Install torch, DGL, and GSF deps that require torch
# torch is installed from the PyTorch CPU wheel index
RUN pip install \
        torch==${TORCH_VERSION} \
        --index-url https://download.pytorch.org/whl/cpu \
    && rm -rf /root/.cache

# -f adds the DGL wheel page as an extra place to look for packages
RUN pip install \
        dgl==${DGL_VERSION} \
        ogb==${OGB_VERSION} \
        transformers==${TRANSFORMERS_VERSION} \
        -f https://data.dgl.ai/wheels-internal/repo.html \
    && rm -rf /root/.cache

# ---------------------------------------------------------------------------
# Stage "runtime": adds the DGL tools, the GraphStorm sources, the ParMETIS
# tool chain (GKlib, METIS, Open MPI, PM4GNN) and an SSH daemon on top of
# the "base" stage.
# ---------------------------------------------------------------------------
FROM base AS runtime

# Make the scripts under dgl/tools importable
ENV PYTHONPATH="/root/dgl/tools/:${PYTHONPATH}"

# Download DGL source code
RUN cd /root && git clone --single-branch --branch 2.2.x https://github.com/dmlc/dgl.git

# Copy GraphStorm source and add to PYTHONPATH
# (paths are relative to the build context, which must contain "code/")
RUN mkdir -p /graphstorm
COPY code/python/graphstorm /graphstorm/python/graphstorm
ENV PYTHONPATH="/graphstorm/python/:${PYTHONPATH}"

# Copy GraphStorm scripts and tools
COPY code/examples /graphstorm/examples
COPY code/inference_scripts /graphstorm/inference_scripts
COPY code/tools /graphstorm/tools
COPY code/training_scripts /graphstorm/training_scripts

# Install GKLib
# Steps are chained with && so a failed clone or build stops the image build
# at the step that actually failed.
RUN cd /root \
    && git clone --single-branch --branch master https://github.com/KarypisLab/GKlib \
    && cd GKlib \
    && make \
    && make install

# Install Metis
RUN cd /root \
    && git clone --single-branch --branch master https://github.com/KarypisLab/METIS.git \
    && cd METIS \
    && make config shared=1 cc=gcc prefix=/root/local i64=1 \
    && make install

# Install MPI
# Open MPI is built from source into /usr/local. The downloaded tarball and
# the build tree are removed afterwards; the previous cleanup named a
# non-existent "mpich-4.1.1.tar.gz" and therefore removed nothing.
RUN wget https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.1.tar.gz \
    && tar -xzvf openmpi-4.1.1.tar.gz \
    && cd openmpi-4.1.1 \
    && ./configure --prefix=/usr/local \
    && make all \
    && make install \
    && ldconfig \
    && cd .. \
    && rm -rf openmpi-4.1.1 openmpi-4.1.1.tar.gz

# Install Parmetis
# PM4GNN is compiled with the MPI compiler wrapper and installed under
# /root/local, which is then added to PATH and LD_LIBRARY_PATH.
RUN cd /root \
    && git clone --single-branch --branch main https://github.com/KarypisLab/PM4GNN.git \
    && cd PM4GNN \
    && make config cc=mpicc prefix=/root/local \
    && make install
ENV PATH=$PATH:/root/local/bin
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/root/local/lib/

# Set up SSH access
ENV SSH_PORT=2222

# Rewrite sshd_config so the first "#Port 22" line becomes "Port ${SSH_PORT}"
RUN cat /etc/ssh/sshd_config > /tmp/sshd_config \
    && sed "0,/^#Port 22/s//Port ${SSH_PORT}/" /tmp/sshd_config > /etc/ssh/sshd_config
ENV SSHDIR=$HOME/.ssh
# Generate a passphrase-less RSA key pair at build time and authorize it.
# NOTE: the key is baked into the image, so every container started from this
# image shares the same key pair.
RUN mkdir -p ${SSHDIR} \
    && ssh-keygen -t rsa -f ${SSHDIR}/id_rsa -N '' \
    && cp ${SSHDIR}/id_rsa.pub ${SSHDIR}/authorized_keys \
    && mkdir /run/sshd

EXPOSE ${SSH_PORT}

# Run the SSH daemon in the foreground as the container's main process
CMD ["/usr/sbin/sshd", "-D"]
2 changes: 2 additions & 0 deletions python/graphstorm/gpartition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@
Modules for local graph partitioning.
"""
from .random_partition import (RandomPartitionAlgorithm)
from .metis_partition import (ParMetisPartitionAlgorithm)
from .partition_config import (ParMETISConfig)
9 changes: 7 additions & 2 deletions python/graphstorm/gpartition/dist_partition_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
from typing import Dict
from threading import Thread

from graphstorm.gpartition import RandomPartitionAlgorithm
from graphstorm.gpartition import (ParMetisPartitionAlgorithm, ParMETISConfig,
RandomPartitionAlgorithm)
from graphstorm.utils import get_log_level


Expand Down Expand Up @@ -117,6 +118,10 @@ def main():
part_start = time.time()
if args.partition_algorithm == "random":
partitioner = RandomPartitionAlgorithm(metadata_dict)
elif args.partition_algorithm == "parmetis":
partition_config = ParMETISConfig(args.ip_list, args.input_path,
args.dgl_tool_path, args.metadata_filename)
partitioner = ParMetisPartitionAlgorithm(metadata_dict, partition_config)
else:
raise RuntimeError(f"Unknown partition algorithm {args.part_algorithm}")

Expand Down Expand Up @@ -161,7 +166,7 @@ def parse_args() -> argparse.Namespace:
argparser.add_argument("--dgl-tool-path", type=str,
help="The path to dgl/tools")
argparser.add_argument("--partition-algorithm", type=str, default="random",
choices=["random"], help="Partition algorithm to use.")
choices=["random", "parmetis"], help="Partition algorithm to use.")
argparser.add_argument("--ip-list", type=str,
help="A file storing the ip list of instances of the partition cluster.")
argparser.add_argument("--do-dispatch", action='store_true')
Expand Down
Loading
Loading