Skip to content

Commit

Permalink
Merge branch 'main' into gsp_process_time_out
Browse files Browse the repository at this point in the history
  • Loading branch information
jalencato authored Dec 20, 2024
2 parents e4d1e21 + de94bb2 commit f1d1c0f
Show file tree
Hide file tree
Showing 13 changed files with 4,461 additions and 84 deletions.
182 changes: 182 additions & 0 deletions docker/build_graphstorm_image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
#!/usr/bin/env bash

# Strict mode: -E makes the ERR trap apply inside functions and subshells,
# -e exits on unhandled command failures, -u rejects unset variables, and
# pipefail makes a pipeline fail when any of its stages fails.
set -Eeuo pipefail
# Run cleanup() on interrupt, termination, error, and on every exit.
trap cleanup SIGINT SIGTERM ERR EXIT

# Absolute physical path (pwd -P) of the directory that contains this script.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd -P)

#######################################
# Print the help text for the image build script and exit.
# Outputs:
#   Writes the usage text to stdout.
# Returns:
#   Never returns; exits with the status of the last command (the cat).
#######################################
usage() {
    # Resolve the script name once; it appears twice in the help text.
    local script_name
    script_name=$(basename "${BASH_SOURCE[0]}")
    cat <<EOF
Usage: ${script_name} [-h] [-x] -e sagemaker
Builds the GraphStorm training/inference Docker images.
Available options:
-h, --help Print this help and exit
-x, --verbose Print script debug info (set -x)
-e, --environment Image execution environment. Must be one of 'local' or 'sagemaker'. Required.
-d, --device Device type, must be one of 'cpu' or 'gpu'. Default is 'gpu'.
-p, --path Path to graphstorm root directory, default is one level above this script's location.
-i, --image Docker image name, default is 'graphstorm'.
-s, --suffix Suffix for the image tag, can be used to push custom image tags. Default is "<environment>-<device>".
-b, --build Docker build directory prefix, default is '/tmp/graphstorm-build/docker'.
Example:
bash ${script_name} -e sagemaker --device cpu
# Will build an image tagged as 'graphstorm:sagemaker-cpu'
EOF
    exit
}

#######################################
# Print a diagnostic message to stderr, interpreting backslash escapes.
# Arguments:
#   $1 - message text (optional; an empty line is printed when omitted)
# Outputs:
#   Writes the message to stderr.
#######################################
msg() {
    local text="${1-}"
    echo -e "${text}" >&2
}

#######################################
# Report an error through msg() and terminate the script.
# Arguments:
#   $1 - error message
#   $2 - exit status (optional, defaults to 1)
# Returns:
#   Never returns; exits with the requested status.
#######################################
die() {
    local error_text=$1 exit_status=${2-1}
    msg "${error_text}"
    exit "${exit_status}"
}

#######################################
# Parse the command-line options of the image build script.
# Globals:
#   SCRIPT_DIR (read)
#   DEVICE_TYPE, GSF_HOME, IMAGE_NAME, BUILD_DIR, SUFFIX (written)
#   EXEC_ENV (written when -e/--environment is given; otherwise a value
#             already present in the environment is kept)
# Arguments:
#   The script's command-line arguments ("$@").
# Returns:
#   0 on success. Exits through die() on an unknown option, on an option
#   that is missing its value, or when no execution environment is set.
#######################################
parse_params() {
    # default values of variables set from params
    DEVICE_TYPE="gpu"
    GSF_HOME="${SCRIPT_DIR}/../"
    IMAGE_NAME='graphstorm'
    BUILD_DIR='/tmp/graphstorm-build/docker'
    SUFFIX=""

    while :; do
        case "${1-}" in
            -h | --help) usage ;;
            -x | --verbose) set -x ;;
            -e | --environment | -d | --device | -p | --path | \
                -b | --build | -i | --image | -s | --suffix)
                # A value-taking option given as the last argument would
                # otherwise make the shared `shift` at the end of the loop
                # fail, which under `set -e` aborts without any message.
                (($# >= 2)) || die "Missing value for option: $1"
                case "$1" in
                    -e | --environment) EXEC_ENV="$2" ;;
                    -d | --device) DEVICE_TYPE="$2" ;;
                    -p | --path) GSF_HOME="$2" ;;
                    -b | --build) BUILD_DIR="$2" ;;
                    -i | --image) IMAGE_NAME="$2" ;;
                    -s | --suffix) SUFFIX="$2" ;;
                esac
                shift # drop the option name; the value is dropped below
                ;;
            -?*) die "Unknown option: $1" ;;
            *) break ;;
        esac
        shift
    done

    # check required params and arguments
    if [[ -z "${EXEC_ENV-}" ]]; then
        die "Missing required parameter: -e/--environment [local|sagemaker]"
    fi

    return 0
}

#######################################
# Trap handler: remove the source code staged for the Docker build.
# Globals:
#   BUILD_DIR (read; may be unset if the trap fires before option parsing)
# Returns:
#   0 (rm -rf does not fail on missing paths).
#######################################
cleanup() {
    # Disarm the traps first so cleanup cannot be re-entered.
    trap - SIGINT SIGTERM ERR EXIT
    # script cleanup here
    # BUILD_DIR is expanded with a default because this handler can run
    # before parse_params has assigned it, and the script uses `set -u`.
    if [[ -n "${BUILD_DIR:-}" ]]; then
        # The build stages code in "${BUILD_DIR}/code", so that is the
        # directory to remove.
        rm -rf -- "${BUILD_DIR}/code"
    fi
}

# Main flow: parse options, validate them, stage the build context, and
# build the Docker image.
parse_params "$@"

if [[ ${EXEC_ENV} != "local" && ${EXEC_ENV} != "sagemaker" ]]; then
    die "--environment parameter needs to be one of 'local' or 'sagemaker', got ${EXEC_ENV}"
fi

# Reject unsupported devices before doing any work. Without this check an
# unknown device leaves SOURCE_IMAGE unset for 'sagemaker' (an "unbound
# variable" failure at docker build time) and silently selects the CPU base
# image for 'local'.
if [[ ${DEVICE_TYPE} != "gpu" && ${DEVICE_TYPE} != "cpu" ]]; then
    die "--device parameter needs to be one of 'cpu' or 'gpu', got ${DEVICE_TYPE}"
fi

# Print build parameters
msg "Execution parameters:"
msg "- EXECUTION ENVIRONMENT: ${EXEC_ENV}"
msg "- DEVICE_TYPE: ${DEVICE_TYPE}"
msg "- GSF_HOME: ${GSF_HOME}"
msg "- IMAGE_NAME: ${IMAGE_NAME}"
msg "- SUFFIX: ${SUFFIX}"

# Prepare Docker build directory: start from an empty directory every time
if [[ -d ${BUILD_DIR} ]]; then
    rm -rf "${BUILD_DIR}"
fi
mkdir -p "${BUILD_DIR}"

# Authenticate to ECR to be able to pull source SageMaker or public.ecr.aws image
msg "Authenticating to public ECR registry"
if [[ ${EXEC_ENV} == "sagemaker" ]]; then
    # Pulling SageMaker image, login to public SageMaker ECR registry
    aws ecr get-login-password --region us-east-1 |
        docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com
else
    # Pulling local image, login to Amazon ECR Public Gallery
    aws ecr-public get-login-password --region us-east-1 |
        docker login --username AWS --password-stdin public.ecr.aws
fi

# Directory inside the build context that receives the GraphStorm sources
CODE_DIR="${BUILD_DIR}/code"
mkdir -p "${CODE_DIR}"
# TODO: After deprecating the old build scripts, the code copying commands
# can be merged for both local and sagemaker environments, but will
# need Dockerfile changes to support both.

# Set image name
DOCKER_FULLNAME="${IMAGE_NAME}:${EXEC_ENV}-${DEVICE_TYPE}${SUFFIX}"

if [[ $EXEC_ENV = "local" ]]; then

    cp "$SCRIPT_DIR/local/fetch_and_run.sh" "$CODE_DIR"
    cp -r "$GSF_HOME/python" "${CODE_DIR}/python"
    cp -r "$GSF_HOME/examples" "${CODE_DIR}/examples"
    cp -r "$GSF_HOME/inference_scripts" "${CODE_DIR}/inference_scripts"
    cp -r "$GSF_HOME/tools" "${CODE_DIR}/tools"
    cp -r "$GSF_HOME/training_scripts" "${CODE_DIR}/training_scripts"

    DOCKERFILE="${GSF_HOME}/docker/local/Dockerfile.local"

    if [[ $DEVICE_TYPE = "gpu" ]]; then
        SOURCE_IMAGE="nvidia/cuda:12.1.1-runtime-ubuntu22.04"
    else
        SOURCE_IMAGE="public.ecr.aws/ubuntu/ubuntu:22.04_stable"
    fi

elif [[ $EXEC_ENV = "sagemaker" ]]; then
    DOCKERFILE="${GSF_HOME}/docker/sagemaker/Dockerfile.sm"
    # Copy the Python sources without compiled bytecode/extension artifacts
    rsync -a --exclude="*.pyc" --exclude="*.pyo" --exclude="*.pyd" \
        "${GSF_HOME}/python" "$CODE_DIR/graphstorm/"
    cp -r "${GSF_HOME}/sagemaker" "$CODE_DIR/graphstorm/sagemaker"
    cp -r "${GSF_HOME}/docker/sagemaker/build_artifacts" "$BUILD_DIR"

    if [[ $DEVICE_TYPE = "gpu" ]]; then
        SOURCE_IMAGE="763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker"
    else
        # DEVICE_TYPE was validated above, so this branch is 'cpu'
        SOURCE_IMAGE="763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-cpu-py311-ubuntu20.04-sagemaker"
    fi
fi

# Use Buildkit to avoid pulling both CPU and GPU images
echo "Building Docker image: ${DOCKER_FULLNAME}"
DOCKER_BUILDKIT=1 docker build \
    --build-arg DEVICE="$DEVICE_TYPE" \
    --build-arg SOURCE="${SOURCE_IMAGE}" \
    -f "$DOCKERFILE" "${BUILD_DIR}" -t "$DOCKER_FULLNAME"
12 changes: 12 additions & 0 deletions docker/parmetis/Dockerfile.parmetis
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,16 @@ RUN mkdir -p ${SSHDIR} \

EXPOSE ${SSH_PORT}

COPY code/fetch_and_run.sh /graphstorm/fetch_and_run.sh

# Install the tools needed to download and unpack the AWS CLI bundle.
# apt-get is used because its command-line interface is stable for scripts;
# the package lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    unzip \
    && rm -rf /var/lib/apt/lists/*

# Install aws-cli, then delete the downloaded archive and unpacked installer.
# curl -f makes an HTTP error fail the build instead of saving an error page.
RUN curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
    && unzip -q awscliv2.zip \
    && ./aws/install \
    && rm -rf awscliv2.zip aws


CMD ["/usr/sbin/sshd", "-D"]
144 changes: 144 additions & 0 deletions docker/push_graphstorm_image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/usr/bin/env bash
# Strict mode: -E makes the ERR trap apply inside functions and subshells,
# -e exits on unhandled command failures, -u rejects unset variables, and
# pipefail makes a pipeline fail when any of its stages fails.
set -Eeuo pipefail
# Run cleanup() on interrupt, termination, error, and on every exit.
trap cleanup SIGINT SIGTERM ERR EXIT

#######################################
# Print the help text for the image push script and exit.
# Outputs:
#   Writes the usage text to stdout.
# Returns:
#   Never returns; exits with the status of the last command (the cat).
#######################################
usage() {
    cat <<EOF
Usage: $(basename "${BASH_SOURCE[0]}") [-h] [-x] -e/--environment [sagemaker|local] [--region ...] [--account ...]
Pushes GraphStorm image to ECR.
Available options:
-h, --help Print this help and exit
-x, --verbose Print script debug info (set -x)
-e, --environment Image execution environment. Must be one of 'local' or 'sagemaker'. Required.
-d, --device Device type. Must be one of 'gpu' or 'cpu'. Default is 'gpu'.
-i, --image Docker image name, default is 'graphstorm'.
-s, --suffix Suffix for the image tag, can be used to push custom image tags. Default tag is "<environment>-<device>".
-r, --region AWS Region to which we'll push the image. By default will get from aws-cli configuration.
-a, --account AWS Account ID. By default will get from aws-cli configuration.
Example:
bash $(basename "${BASH_SOURCE[0]}") -e sagemaker --device cpu --account 123456789012 --region us-east-1
# Will push an image to '123456789012.dkr.ecr.us-east-1.amazonaws.com/graphstorm:sagemaker-cpu'
EOF
    exit
}

#######################################
# Print a diagnostic message to stderr, interpreting backslash escapes.
# Arguments:
#   $1 - message text (optional; an empty line is printed when omitted)
# Outputs:
#   Writes the message to stderr.
#######################################
msg() {
    local text="${1-}"
    echo -e "${text}" >&2
}

#######################################
# Report an error through msg() and terminate the script.
# Arguments:
#   $1 - error message
#   $2 - exit status (optional, defaults to 1)
# Returns:
#   Never returns; exits with the requested status.
#######################################
die() {
    local error_text=$1 exit_status=${2-1}
    msg "${error_text}"
    exit "${exit_status}"
}

#######################################
# Parse the command-line options of the image push script.
# Globals:
#   DEVICE_TYPE, IMAGE_NAME, SUFFIX, REGION, ACCOUNT (written)
#   EXEC_ENV (written when -e/--environment is given; otherwise a value
#             already present in the environment is kept)
# Arguments:
#   The script's command-line arguments ("$@").
# Returns:
#   0 on success. Exits through die() on an unknown option, on an option
#   that is missing its value, when no execution environment is set, or
#   when the AWS account ID is neither given nor obtainable from aws-cli.
#######################################
parse_params() {
    # default values of variables set from params
    DEVICE_TYPE="gpu"
    IMAGE_NAME='graphstorm'
    SUFFIX=""
    # Region and account are resolved from aws-cli after parsing, and only
    # when not given, so -h and explicit -r/-a work without AWS credentials.
    REGION=""
    ACCOUNT=""

    while :; do
        case "${1-}" in
            -h | --help) usage ;;
            -x | --verbose) set -x ;;
            -e | --environment | -i | --image | -d | --device | \
                -s | --suffix | -r | --region | -a | --account)
                # A value-taking option given as the last argument would
                # otherwise make the shared `shift` at the end of the loop
                # fail, which under `set -e` aborts without any message.
                (($# >= 2)) || die "Missing value for option: $1"
                case "$1" in
                    -e | --environment) EXEC_ENV="$2" ;;
                    -i | --image) IMAGE_NAME="$2" ;;
                    -d | --device) DEVICE_TYPE="$2" ;;
                    -s | --suffix) SUFFIX="$2" ;;
                    -r | --region) REGION="$2" ;;
                    -a | --account) ACCOUNT="$2" ;;
                esac
                shift # drop the option name; the value is dropped below
                ;;
            -?*) die "Unknown option: $1" ;;
            *) break ;;
        esac
        shift
    done

    if [[ -z "${EXEC_ENV-}" ]]; then
        die "Missing required parameter: -e/--environment [local|sagemaker]"
    fi

    if [[ -z "${REGION}" ]]; then
        # `aws configure get` exits non-zero when no region is configured;
        # fall back to us-east-1 in that case.
        REGION=$(aws configure get region) || REGION=""
        REGION=${REGION:-us-east-1}
    fi

    if [[ -z "${ACCOUNT}" ]]; then
        ACCOUNT=$(aws sts get-caller-identity --query Account --output text) ||
            die "Unable to determine the AWS account ID; pass -a/--account or configure AWS credentials"
    fi

    return 0
}

#######################################
# Trap handler: remove the /tmp/ecr_error scratch file if it exists.
# Returns:
#   0 (rm -f does not fail on a missing file).
#######################################
cleanup() {
    local -r ecr_error_file='/tmp/ecr_error'
    # Disarm the traps first so cleanup cannot be re-entered.
    trap - SIGINT SIGTERM ERR EXIT
    # script cleanup here
    rm -f "${ecr_error_file}"
}

# Main flow: parse options, make sure the ECR repository exists, log in to
# the registry, then tag and push the locally built image.
parse_params "${@}"

if [[ ${EXEC_ENV} != "sagemaker" && ${EXEC_ENV} != "local" ]]; then
    die "--environment parameter needs to be one of 'sagemaker', or 'local' got ${EXEC_ENV}"
fi

TAG="${EXEC_ENV}-${DEVICE_TYPE}${SUFFIX}"
IMAGE="${IMAGE_NAME}"

msg "Execution parameters: "
msg "- ENVIRONMENT: ${EXEC_ENV}"
msg "- DEVICE TYPE: ${DEVICE_TYPE}"
msg "- IMAGE: ${IMAGE}"
msg "- TAG: ${TAG}"
msg "- REGION: ${REGION}"
msg "- ACCOUNT: ${ACCOUNT}"

FULLNAME="${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/${IMAGE}:${TAG}"

# If the repository doesn't exist in ECR, create it.
echo "Getting or creating container repository: ${IMAGE}"
# The command is run directly (no eval) so the user-supplied image name and
# region are passed as plain arguments and never re-parsed by the shell.
if ! aws ecr describe-repositories --repository-names "${IMAGE}" --region "${REGION}" >/dev/null 2>&1; then
    msg "WARNING: ECR repository ${IMAGE} does not exist in region ${REGION}. Attempting to create..."

    # Capture only stderr in error_msg: inside the substitution stderr is sent
    # to the capture pipe and stdout is restored to the script's stdout via
    # fd 3. This avoids a scratch file with a predictable name in /tmp.
    if ! { error_msg=$(aws ecr create-repository --repository-name "${IMAGE}" --region "${REGION}" 2>&1 1>&3-); } 3>&1; then
        if [[ "${error_msg}" == *AccessDeniedException* ]]; then
            msg "ERROR: You don't have sufficient permissions to create ECR repository"
            msg "Required permission: ecr:CreateRepository"
        else
            msg "ERROR: Failed to create ECR repository: ${error_msg}"
        fi
        exit 1
    fi
    msg "Successfully created ECR repository ${IMAGE}"
fi

msg "Logging into ECR with local credentials"
aws ecr get-login-password --region "${REGION}" |
    docker login --username AWS --password-stdin "${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com"

msg "Pushing image to ${FULLNAME}"

docker tag "${IMAGE}:${TAG}" "${FULLNAME}"

docker push "${FULLNAME}"
5 changes: 3 additions & 2 deletions docker/sagemaker/Dockerfile.sm
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@

ARG DEVICE=gpu
ARG DGL_VERSION=2.3.0
ARG SOURCE

FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker as branch-gpu
FROM ${SOURCE:-763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker} as branch-gpu
ENV dev_type=GPU
ARG DGL_VERSION
# Install DGL GPU version
RUN pip3 install dgl==${DGL_VERSION}+cu121 -f https://data.dgl.ai/wheels/torch-2.3/cu121/repo.html && rm -rf /root/.cache

FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-cpu-py311-ubuntu20.04-sagemaker as branch-cpu
FROM ${SOURCE:-763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-cpu-py311-ubuntu20.04-sagemaker} as branch-cpu
ENV dev_type=CPU
ARG DGL_VERSION
# Install DGL CPU version
Expand Down
Loading

0 comments on commit f1d1c0f

Please sign in to comment.