Merge branch 'main' into gsp_process_time_out

awslabs · Dec 20, 2024 · f1d1c0f · f1d1c0f
2 parents e4d1e21 + de94bb2
commit f1d1c0f
Show file tree

Hide file tree

Showing 13 changed files with 4,461 additions and 84 deletions.
diff --git a/docker/build_graphstorm_image.sh b/docker/build_graphstorm_image.sh
@@ -0,0 +1,182 @@
+#!/usr/bin/env bash
+
+set -Eeuo pipefail
+trap cleanup SIGINT SIGTERM ERR EXIT
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd -P)
+
+# Print the command-line help to stdout, then exit the script.
+usage() {
+    local script_name
+    script_name=$(basename "${BASH_SOURCE[0]}")
+    cat <<EOF
+Usage: ${script_name} [-h] [-x] -e sagemaker
+
+Builds the GraphStorm training/inference Docker images.
+
+Available options:
+
+-h, --help          Print this help and exit
+-x, --verbose       Print script debug info (set -x)
+-e, --environment   Image execution environment. Must be one of 'local' or 'sagemaker'. Required.
+-d, --device        Device type, must be one of 'cpu' or 'gpu'. Default is 'gpu'.
+-p, --path          Path to graphstorm root directory, default is one level above this script's location.
+-i, --image         Docker image name, default is 'graphstorm'.
+-s, --suffix        Suffix for the image tag, can be used to push custom image tags. Default is "<environment>-<device>".
+-b, --build         Docker build directory prefix, default is '/tmp/graphstorm-build/docker'.
+
+Example:
+
+    bash ${script_name} -e sagemaker --device cpu
+    # Will build an image tagged as 'graphstorm:sagemaker-cpu'
+
+EOF
+    exit
+}
+
+msg() {
+    echo >&2 -e "${1-}"
+}
+
+die() {
+    local msg=$1
+    local code=${2-1} # default exit status 1
+    msg "$msg"
+    exit "$code"
+}
+
+parse_params() {
+    # default values of variables set from params
+    DEVICE_TYPE="gpu"
+    GSF_HOME="${SCRIPT_DIR}/../"
+    IMAGE_NAME='graphstorm'
+    BUILD_DIR='/tmp/graphstorm-build/docker'
+    SUFFIX=""
+
+    while :; do
+        case "${1-}" in
+        -h | --help) usage ;;
+        -x | --verbose) set -x ;;
+        -e | --environment)
+            EXEC_ENV="${2-}"
+            shift
+            ;;
+        -d | --device)
+            DEVICE_TYPE="${2-}"
+            shift
+            ;;
+        -p | --path)
+            GSF_HOME="${2-}"
+            shift
+            ;;
+        -b | --build)
+            BUILD_DIR="${2-}"
+            shift
+            ;;
+        -i | --image)
+            IMAGE_NAME="${2-}"
+            shift
+            ;;
+        -s | --suffix)
+            SUFFIX="${2-}"
+            shift
+            ;;
+        -?*) die "Unknown option: $1" ;;
+        *) break ;;
+        esac
+        shift
+    done
+
+    # check required params and arguments
+    [[ -z "${EXEC_ENV-}" ]] && die "Missing required parameter: -e/--environment [local|sagemaker]"
+
+    return 0
+}
+
+# Trap handler: remove the code staged for the Docker build.
+cleanup() {
+    # Disarm the traps first so the handler cannot be entered a second time
+    trap - SIGINT SIGTERM ERR EXIT
+    # BUILD_DIR is assigned in parse_params; the "-" default keeps `set -u`
+    # from aborting the handler if it fires before that assignment.
+    if [[ -n "${BUILD_DIR-}" ]]; then
+        # Code is staged in "${BUILD_DIR}/code" (CODE_DIR), so that is the
+        # directory to remove.
+        rm -rf "${BUILD_DIR}/code"
+    fi
+}
+
+parse_params "$@"
+
+if [[ ${EXEC_ENV} == "local" || ${EXEC_ENV} == "sagemaker" ]]; then
+    : # Do nothing
+else
+    die "--environment parameter needs to be one of 'local' or 'sagemaker', got ${EXEC_ENV}"
+fi
+
+# Print build parameters
+msg "Execution parameters:"
+msg "- EXECUTION ENVIRONMENT: ${EXEC_ENV}"
+msg "- DEVICE_TYPE: ${DEVICE_TYPE}"
+msg "- GSF_HOME: ${GSF_HOME}"
+msg "- IMAGE_NAME: ${IMAGE_NAME}"
+msg "- SUFFIX: ${SUFFIX}"
+
+# Prepare Docker build directory
+if [[ -d ${BUILD_DIR} ]]; then
+        rm -rf "${BUILD_DIR}"
+fi
+mkdir -p "${BUILD_DIR}"
+
+# Log in to the registry that hosts the base image: the SageMaker ECR
+# registry for 'sagemaker' builds, the Amazon ECR Public Gallery otherwise.
+msg "Authenticating to public ECR registry"
+if [[ ${EXEC_ENV} == "sagemaker" ]]; then
+    ECR_SERVICE="ecr"
+    ECR_REGISTRY="763104351884.dkr.ecr.us-east-1.amazonaws.com"
+else
+    ECR_SERVICE="ecr-public"
+    ECR_REGISTRY="public.ecr.aws"
+fi
+aws "${ECR_SERVICE}" get-login-password --region us-east-1 |
+    docker login --username AWS --password-stdin "${ECR_REGISTRY}"
+
+# Stage the code under the Docker build directory
+CODE_DIR="${BUILD_DIR}/code"
+mkdir -p "${CODE_DIR}"
+# TODO: After deprecating the old build scripts, the code copying commands
+# can be merged for both local and sagemaker environments, but will
+# need Dockerfile changes to support both.
+
+
+# Set image name
+DOCKER_FULLNAME="${IMAGE_NAME}:${EXEC_ENV}-${DEVICE_TYPE}${SUFFIX}"
+
+if [[ $EXEC_ENV = "local" ]]; then
+
+    cp "$SCRIPT_DIR/local/fetch_and_run.sh" "$CODE_DIR"
+    cp -r "$GSF_HOME/python" "${CODE_DIR}/python"
+    cp -r "$GSF_HOME/examples" "${CODE_DIR}/examples"
+    cp -r "$GSF_HOME/inference_scripts" "${CODE_DIR}/inference_scripts"
+    cp -r "$GSF_HOME/tools" "${CODE_DIR}/tools"
+    cp -r "$GSF_HOME/training_scripts" "${CODE_DIR}/training_scripts"
+
+    DOCKERFILE="${GSF_HOME}/docker/local/Dockerfile.local"
+
+    # Any device type other than 'gpu' selects the CPU base image
+    if [[ $DEVICE_TYPE = "gpu" ]]; then
+        SOURCE_IMAGE="nvidia/cuda:12.1.1-runtime-ubuntu22.04"
+    else
+        SOURCE_IMAGE="public.ecr.aws/ubuntu/ubuntu:22.04_stable"
+    fi
+
+elif [[ $EXEC_ENV = "sagemaker" ]]; then
+    DOCKERFILE="${GSF_HOME}/docker/sagemaker/Dockerfile.sm"
+
+    # Pick the base image before copying anything, and fail with a clear
+    # message on an unsupported device: otherwise SOURCE_IMAGE would stay
+    # unset and the build below would abort on an unbound variable.
+    case "${DEVICE_TYPE}" in
+    gpu)
+        SOURCE_IMAGE="763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker"
+        ;;
+    cpu)
+        SOURCE_IMAGE="763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-cpu-py311-ubuntu20.04-sagemaker"
+        ;;
+    *)
+        die "--device parameter needs to be one of 'cpu' or 'gpu', got ${DEVICE_TYPE}"
+        ;;
+    esac
+
+    # Copy the Python sources without compiled Python artifacts
+    rsync -a --exclude="*.pyc" --exclude="*.pyo" --exclude="*.pyd" \
+        "${GSF_HOME}/python" "$CODE_DIR/graphstorm/"
+    cp -r "${GSF_HOME}/sagemaker" "$CODE_DIR/graphstorm/sagemaker"
+    cp -r "${GSF_HOME}/docker/sagemaker/build_artifacts" "$BUILD_DIR"
+fi
+
+# Use Buildkit to avoid pulling both CPU and GPU images
+echo "Building Docker image: ${DOCKER_FULLNAME}"
+DOCKER_BUILDKIT=1 docker build \
+    --build-arg DEVICE="$DEVICE_TYPE" \
+    --build-arg SOURCE="${SOURCE_IMAGE}" \
+    -f "$DOCKERFILE" "${BUILD_DIR}" -t "$DOCKER_FULLNAME"
diff --git a/docker/parmetis/Dockerfile.parmetis b/docker/parmetis/Dockerfile.parmetis
@@ -109,4 +109,16 @@ RUN mkdir -p ${SSHDIR} \
 
 EXPOSE ${SSH_PORT}
 
+COPY code/fetch_and_run.sh /graphstorm/fetch_and_run.sh
+
+# curl and unzip are needed to download and unpack the AWS CLI installer.
+# apt-get is the interface intended for scripts; the package lists are
+# removed in the same layer so they do not end up in the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install aws-cli, then delete the downloaded archive and the extracted
+# installer in the same layer.
+RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
+    && unzip awscliv2.zip \
+    && ./aws/install \
+    && rm -rf awscliv2.zip aws
+
+
 CMD ["/usr/sbin/sshd", "-D"]
diff --git a/docker/push_graphstorm_image.sh b/docker/push_graphstorm_image.sh
@@ -0,0 +1,144 @@
+#!/usr/bin/env bash
+set -Eeuo pipefail
+trap cleanup SIGINT SIGTERM ERR EXIT
+
+# Print the command-line help to stdout, then exit the script.
+usage() {
+    cat <<EOF
+Usage: $(basename "${BASH_SOURCE[0]}") [-h] [-x] -e/--environment [sagemaker|local] [--region ...] [--account ...]
+
+Pushes GraphStorm image to ECR.
+
+Available options:
+
+-h, --help          Print this help and exit
+-x, --verbose       Print script debug info (set -x)
+-e, --environment   Image execution environment. Must be one of 'local' or 'sagemaker'. Required.
+-d, --device        Device type. Must be one of 'gpu' or 'cpu'. Default is 'gpu'.
+-i, --image         Docker image name, default is 'graphstorm'.
+-s, --suffix        Suffix for the image tag, can be used to push custom image tags. Default tag is "<environment>-<device>".
+-r, --region        AWS Region to which we'll push the image. By default will get from aws-cli configuration.
+-a, --account       AWS Account ID. By default will get from aws-cli configuration.
+
+Example:
+
+    bash $(basename "${BASH_SOURCE[0]}") -e sagemaker --device cpu --account 123456789012 --region us-east-1
+    # Will push an image to '123456789012.dkr.ecr.us-east-1.amazonaws.com/graphstorm:sagemaker-cpu'
+
+EOF
+    exit
+}
+
+msg() {
+    echo >&2 -e "${1-}"
+}
+
+die() {
+    local msg=$1
+    local code=${2-1} # default exit status 1
+    msg "$msg"
+    exit "$code"
+}
+
+parse_params() {
+    # default values of variables set from params
+    DEVICE_TYPE="gpu"
+    IMAGE_NAME='graphstorm'
+    SUFFIX=""
+    REGION=$(aws configure get region) || REGION=""
+    REGION=${REGION:-us-east=1}
+    ACCOUNT=$(aws sts get-caller-identity --query Account --output text)
+
+    while :; do
+        case "${1-}" in
+        -h | --help) usage ;;
+        -x | --verbose) set -x ;;
+        -e | --environment)
+            EXEC_ENV="${2-}"
+            shift
+            ;;
+        -i | --image)
+            IMAGE_NAME="${2-}"
+            shift
+            ;;
+        -d | --device)
+            DEVICE_TYPE="${2-}"
+            shift
+            ;;
+        -s | --suffix)
+            SUFFIX="${2-}"
+            shift
+            ;;
+        -r | --region)
+            REGION="${2-}"
+            shift
+            ;;
+        -a | --account)
+            ACCOUNT="${2-}"
+            shift
+            ;;
+        -?*) die "Unknown option: $1" ;;
+        *) break ;;
+        esac
+        shift
+    done
+
+    [[ -z "${EXEC_ENV-}" ]] && die "Missing required parameter: -e/--environment [local|sagemaker]"
+
+    return 0
+}
+
+# Trap handler: delete the file used to capture the stderr output of the
+# ECR repository creation.
+cleanup() {
+    # Disarm the traps first so the handler cannot be entered a second time
+    trap - SIGINT SIGTERM ERR EXIT
+    rm -f "/tmp/ecr_error"
+}
+
+parse_params "${@}"
+
+if [[ ${EXEC_ENV} == "sagemaker" || ${EXEC_ENV} == "local" ]]; then
+    : # Do nothing
+else
+    die "--environment parameter needs to be one of 'sagemaker', or 'local' got ${EXEC_ENV}"
+fi
+
+TAG="${EXEC_ENV}-${DEVICE_TYPE}${SUFFIX}"
+IMAGE="${IMAGE_NAME}"
+
+msg "Execution parameters: "
+msg "- ENVIRONMENT: ${EXEC_ENV}"
+msg "- DEVICE TYPE: ${DEVICE_TYPE}"
+msg "- IMAGE: ${IMAGE}"
+msg "- TAG: ${TAG}"
+msg "- REGION: ${REGION}"
+msg "- ACCOUNT: ${ACCOUNT}"
+
+FULLNAME="${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/${IMAGE}:${TAG}"
+
+# If the repository doesn't exist in ECR, create it.
+echo "Getting or creating container repository: ${IMAGE}"
+# The command is run directly rather than through eval, so the image name
+# and region are passed as plain arguments and never re-parsed as shell code.
+if ! aws ecr describe-repositories --repository-names "${IMAGE}" --region "${REGION}" >/dev/null 2>&1; then
+    msg "WARNING: ECR repository ${IMAGE} does not exist in region ${REGION}. Attempting to create..."
+
+    # stderr goes to a file so the failure reason can be inspected below;
+    # cleanup() removes that file when the script exits.
+    if ! aws ecr create-repository --repository-name "${IMAGE}" --region "${REGION}" 2>/tmp/ecr_error; then
+        error_msg=$(</tmp/ecr_error)
+        if grep -q "AccessDeniedException" <<<"${error_msg}"; then
+            msg "ERROR: You don't have sufficient permissions to create ECR repository"
+            msg "Required permission: ecr:CreateRepository"
+        else
+            msg "ERROR: Failed to create ECR repository: ${error_msg}"
+        fi
+        exit 1
+    fi
+    msg "Successfully created ECR repository ${IMAGE}"
+fi
+
+# Authenticate Docker against the target registry; the password is piped
+# through stdin so it never appears on a command line.
+msg "Logging into ECR with local credentials"
+aws ecr get-login-password --region "${REGION}" |
+    docker login --username AWS --password-stdin "${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com"
+
+msg "Pushing image to ${FULLNAME}"
+
+# Give the locally built image its registry-qualified name, then push it
+docker tag "${IMAGE}:${TAG}" "${FULLNAME}"
+
+docker push "${FULLNAME}"
diff --git a/docker/sagemaker/Dockerfile.sm b/docker/sagemaker/Dockerfile.sm
@@ -3,14 +3,15 @@
 
 ARG DEVICE=gpu
 ARG DGL_VERSION=2.3.0
+ARG SOURCE
 
-FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker as branch-gpu
+FROM ${SOURCE:-763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker} as branch-gpu
 ENV dev_type=GPU
 ARG DGL_VERSION
 # Install DGL GPU version
 RUN pip3 install dgl==${DGL_VERSION}+cu121 -f https://data.dgl.ai/wheels/torch-2.3/cu121/repo.html && rm -rf /root/.cache
 
-FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-cpu-py311-ubuntu20.04-sagemaker as branch-cpu
+FROM ${SOURCE:-763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.3.0-cpu-py311-ubuntu20.04-sagemaker} as branch-cpu
 ENV dev_type=CPU
 ARG DGL_VERSION
 # Install DGL CPU version