Merge branch 'main' into parmetis

awslabs · May 3, 2024 · 31d525b · 31d525b
2 parents 71b373b + 6054339
commit 31d525b
Show file tree

Hide file tree

Showing 86 changed files with 5,349 additions and 2,694 deletions.
diff --git a/.github/workflow_scripts/lint_check.sh b/.github/workflow_scripts/lint_check.sh
@@ -5,7 +5,6 @@ set -ex
 
 python3 -m pip install --upgrade prospector pip
 yes | pip3 install astroid==v3.0.0
-FORCE_CUDA=1 python3 -m pip install -e '.[test]'  --no-build-isolation
 pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/data/*.py
 pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/distributed/
 pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/dataloading/

diff --git a/docker/Dockerfile.local b/docker/Dockerfile.local
@@ -1,34 +1,76 @@
-FROM nvidia/cuda:11.3.1-devel-ubuntu20.04
+ARG DEVICE=gpu
+ARG SOURCE
+
+FROM ${SOURCE} as base
+
 ENV DEBIAN_FRONTEND=noninteractive
 ENV HOME=/root
 
-RUN apt-get update
-RUN apt-get install -y --no-install-recommends build-essential python3-dev make g++ vim net-tools
+ARG DGL_VERSION=1.1.3
+ARG OGB_VERSION=1.3.6
+ARG TORCH_VERSION=2.1.2
+ARG TRANSFORMERS_VERSION=4.28.1
 
-RUN apt-get install -y python3-pip git wget psmisc
-RUN apt-get install -y cmake
+RUN apt update && apt install -y --no-install-recommends \
+    git \
+    libicu-dev \
+    openssh-client \
+    openssh-server \
+    python3.9 \
+    python3.9-distutils \
+    python3.9-venv \
+    && rm -rf /var/lib/apt/lists/*
 
-# Install Pytorch
-RUN pip3 install networkx==3.1 pydantic
-RUN pip3 install torch==2.1.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
+# Create and activate a Python venv
+RUN python3.9 -m venv /opt/gs-venv
+ENV PATH="/opt/gs-venv/bin:$PATH"
 
-# Install DGL
-RUN pip3 install dgl==1.0.4+cu117 -f https://data.dgl.ai/wheels/cu117/repo.html
-ENV PYTHONPATH="/root/dgl/tools/:${PYTHONPATH}"
+# Install GraphStorm dependencies
+RUN pip install \
+    boto3==1.34.89 \
+    botocore==1.34.89 \
+    h5py==3.11.0 \
+    networkx==3.1 \
+    psutil==5.9.8 \
+    pyarrow==14.0.0 \
+    pydantic==2.7.0 \
+    scikit-learn==1.4.2 \
+    scipy==1.13.0 \
+    && rm -rf /root/.cache
+
+FROM base as base-cpu
+
+# Install torch, DGL, and GSF deps that require torch
+RUN pip install \
+    torch==${TORCH_VERSION} \
+    --index-url https://download.pytorch.org/whl/cpu \
+    && rm -rf /root/.cache
 
-# Install related Python packages
-RUN pip3 install ogb==1.3.6 scipy pyarrow boto3 scikit-learn transformers
+RUN pip install \
+    dgl==${DGL_VERSION} \
+    ogb==${OGB_VERSION} \
+    transformers==${TRANSFORMERS_VERSION} \
+    -f https://data.dgl.ai/wheels-internal/repo.html && rm -rf /root/.cache
 
-# Install other dependencies
-RUN apt-get install -y cython3 libicu-dev
-RUN pip3 install h5py psutil
+FROM base as base-gpu
 
-RUN apt-get install -y unzip
+# Install torch, DGL, and GSF deps that require torch
+RUN pip install \
+    dgl==${DGL_VERSION}+cu121 \
+    ogb==${OGB_VERSION} \
+    torch==${TORCH_VERSION} \
+    transformers==${TRANSFORMERS_VERSION} \
+    -f https://data.dgl.ai/wheels/cu121/repo.html \
+    && rm -rf /root/.cache
+
+FROM base-${DEVICE} as runtime
+
+ENV PYTHONPATH="/root/dgl/tools/:${PYTHONPATH}"
 
 # Download DGL source code
-RUN cd /root; git clone --recursive https://github.com/dmlc/dgl.git
+RUN cd /root; git clone --single-branch https://github.com/dmlc/dgl.git
 
-# Install GraphStorm from source code
+# Copy GraphStorm source and add to PYTHONPATH
 RUN mkdir -p /graphstorm
 COPY code/python/graphstorm /graphstorm/python/graphstorm
 ENV PYTHONPATH="/graphstorm/python/:${PYTHONPATH}"
@@ -39,16 +81,17 @@ COPY code/inference_scripts /graphstorm/inference_scripts
 COPY code/tools /graphstorm/tools
 COPY code/training_scripts /graphstorm/training_scripts
 
-# Set up SSH
-RUN apt-get install -y openssh-client openssh-server
+# Set up SSH access
 ENV SSH_PORT=2222
+
 RUN cat /etc/ssh/sshd_config > /tmp/sshd_config && \
     sed "0,/^#Port 22/s//Port ${SSH_PORT}/" /tmp/sshd_config > /etc/ssh/sshd_config
 ENV SSHDIR $HOME/.ssh
-RUN mkdir -p ${SSHDIR}
-RUN ssh-keygen -t rsa -f ${SSHDIR}/id_rsa -N ''
-RUN cp ${SSHDIR}/id_rsa.pub ${SSHDIR}/authorized_keys
+RUN mkdir -p ${SSHDIR} \
+    && ssh-keygen -t rsa -f ${SSHDIR}/id_rsa -N '' \
+    && cp ${SSHDIR}/id_rsa.pub ${SSHDIR}/authorized_keys \
+    && mkdir /run/sshd
+
+EXPOSE ${SSH_PORT}
 
-EXPOSE 2222
-RUN mkdir /run/sshd
 CMD ["/usr/sbin/sshd", "-D"]
diff --git a/docker/build_docker_oss4local.sh b/docker/build_docker_oss4local.sh
@@ -1,9 +1,10 @@
 #!/bin/bash
+set -eox pipefail
 
 # process argument 1: graphstorm home folder
 if [ -z "$1" ]; then
-    echo "Please provide the graphstorm home folder that the graphstorm codes are cloned to."
-    echo "For example, ./build_docker_oss4local.sh /graph-storm/"
+    echo "Please provide a path to the root directory of the GraphStorm repository."
+    echo "For example, ./build_docker_oss4local.sh ../ graphstorm local gpu"
     exit 1
 else
     GSF_HOME="$1"
@@ -23,6 +24,13 @@ else
     TAG="$3"
 fi
 
+# process argument 4: docker image type, default is GPU
+if [ -z "$4" ]; then
+    DEVICE_TYPE="gpu"
+else
+    DEVICE_TYPE="$4"
+fi
+
 # Copy scripts and tools codes to the docker folder
 mkdir -p $GSF_HOME"/docker/code"
 cp -r $GSF_HOME"/python" $GSF_HOME"/docker/code/python"
@@ -31,11 +39,30 @@ cp -r $GSF_HOME"/inference_scripts" $GSF_HOME"/docker/code/inference_scripts"
 cp -r $GSF_HOME"/tools" $GSF_HOME"/docker/code/tools"
 cp -r $GSF_HOME"/training_scripts" $GSF_HOME"/docker/code/training_scripts"
 
+aws ecr-public get-login-password --region us-east-1 | \
+    docker login --username AWS --password-stdin public.ecr.aws
+
 # Build OSS docker for EC2 instances that an pull ECR docker images
-DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}"
+DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}-${DEVICE_TYPE}"
 
 echo "Build a local docker image ${DOCKER_FULLNAME}"
-docker build --no-cache -f $GSF_HOME"/docker/Dockerfile.local" . -t $DOCKER_FULLNAME
+
+if [[ $DEVICE_TYPE = "gpu" ]]; then
+    SOURCE_IMAGE="nvidia/cuda:12.1.0-runtime-ubuntu20.04"
+elif [[ $DEVICE_TYPE = "cpu" ]]; then
+    SOURCE_IMAGE="public.ecr.aws/ubuntu/ubuntu:20.04_stable"
+else
+    echo >&2 -e "Image type can only be \"gpu\" or \"cpu\", but got \""$DEVICE_TYPE"\""
+    # remove the temporary code folder
+    rm -rf code
+    exit 1
+fi
+
+# Use Buildkit to avoid pulling both CPU and GPU images
+DOCKER_BUILDKIT=1 docker build \
+    --build-arg DEVICE=$DEVICE_TYPE \
+    --build-arg SOURCE=${SOURCE_IMAGE} \
+    -f "${GSF_HOME}/docker/Dockerfile.local" . -t $DOCKER_FULLNAME
 
 # remove the temporary code folder
 rm -rf $GSF_HOME"/docker/code"
diff --git a/docker/build_docker_sagemaker.sh b/docker/build_docker_sagemaker.sh
@@ -12,9 +12,9 @@ fi
 
 # process argument 2: docker image type, default is GPU
 if [ -z "$2" ]; then
-    IMAGE_TYPE="gpu"
+    DEVICE_TYPE="gpu"
 else
-    IMAGE_TYPE="$2"
+    DEVICE_TYPE="$2"
 fi
 
 # process argument 3: docker image name, default is graphstorm
@@ -39,20 +39,20 @@ cp -r "${GSF_HOME}/sagemaker" code/graphstorm/sagemaker
 cp -r "${GSF_HOME}/docker/sagemaker/build_artifacts" build_artifacts
 
 # Build OSS docker for EC2 instances that an pull ECR docker images
-DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}"
+DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}-${DEVICE_TYPE}"
 
 echo "Build a sagemaker docker image ${DOCKER_FULLNAME}"
 
 # Log in to ECR to pull Docker image
 aws ecr get-login-password --region us-east-1 \
         | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com
 
-if [ $IMAGE_TYPE = "gpu" ] || [ $IMAGE_TYPE = "cpu" ]; then
+if [ $DEVICE_TYPE = "gpu" ] || [ $DEVICE_TYPE = "cpu" ]; then
     # Use Buildkit to avoid pulling both CPU and GPU images
-    DOCKER_BUILDKIT=1 docker build --build-arg DEVICE=$IMAGE_TYPE \
+    DOCKER_BUILDKIT=1 docker build --build-arg DEVICE=$DEVICE_TYPE \
         -f "${GSF_HOME}/docker/sagemaker/Dockerfile.sm" . -t $DOCKER_FULLNAME
 else
-    echo "Image type can only be \"gpu\" or \"cpu\", but got \""$IMAGE_TYPE"\""
+    echo "Device type can only be \"gpu\" or \"cpu\", but got \""$DEVICE_TYPE"\""
     # remove the temporary code folder
     rm -rf code
     exit 1

diff --git a/docs/source/advanced/own-models.rst b/docs/source/advanced/own-models.rst
@@ -198,37 +198,50 @@ the ``ip_config`` argument specifies a ip configuration file, which contains the
 
 Replace DGL DataLoader with the GraphStorm's dataset and dataloader
 `````````````````````````````````````````````````````````````````````
-Because the GraphStorm uses distributed graphs, we need to first load the partitioned graph, which is created in the :ref:`Step 1 <step-1>`, with the `GSgnnNodeTrainData <https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataset.py#L469>`_ class (for edge tasks, GraphStorm also provides `GSgnnEdgeTrainData <https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataset.py#L216>`_). The ``GSgnnNodeTrainData`` could be created as shown in the code below.
+Because GraphStorm uses distributed graphs, we need to first load the partitioned graph, which is created in :ref:`Step 1 <step-1>`, with the `GSgnnData <https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataset.py#L57>`_ class (for edge tasks, the same class is used). The ``GSgnnData`` object can be created as shown in the code below.
 
 .. code-block:: python
 
-    train_data = GSgnnNodeTrainData(config.graph_name,
-                                    config.part_config,
-                                    train_ntypes=config.target_ntype,
-                                    node_feat_field=node_feat_fields,
-                                    label_field=config.label_field)
+    train_data = GSgnnData(config.part_config)
 
-Arguments of this class include the partition configuration JSON file path, which are the outputs of the :ref:`Step 1 <step-1>`. The ``graph_name`` can be found in the JSON file.
+Arguments of this class include the partition configuration JSON file path, which is an output of :ref:`Step 1 <step-1>`.
 
-The other values, the ``train_ntypes``, the ``label_field``, and the ``node_feat_field``, should be consistent with the values in the raw data :ref:`input configuration JSON <input-config>` defined in the :ref:`Step 1 <step-1>`. The ``train_ntypes`` is the ``node_type`` that has ``labels`` specified. The ``label_fields`` is the value specified in ``label_col`` of the ``train_ntype``. The ``node_feat_field`` is a dictionary, whose key is the values of ``node_type``, and value is the values of ``feature_name``.
-
-Then we can put this dataset into GraphStorm's `GSgnnNodeDataLoader <https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataloading.py#L544>`_, which is like:
+Then we can put this dataset into GraphStorm's `GSgnnNodeDataLoader <https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataloading.py#L1237>`_, which is like:
 
 .. code-block:: python
 
+    # Get train idx
+    train_idxs = train_data.get_node_train_set(config.target_ntype)
     # Define the GraphStorm train dataloader
-    dataloader = GSgnnNodeDataLoader(train_data, train_data.train_idxs, fanout=config.fanout,
-                                     batch_size=config.batch_size, device=device, train_task=True)
+    dataloader = GSgnnNodeDataLoader(train_data,
+                                     train_idxs, fanout=config.fanout,
+                                     batch_size=config.batch_size,
+                                     label_field=config.label_field,
+                                     node_feats=node_feat_fields,train_task=True)
+
     # Optional: Define the evaluation dataloader
-    eval_dataloader = GSgnnNodeDataLoader(train_data, train_data.val_idxs,fanout=config.fanout,
-                                          batch_size=config.eval_batch_size, device=device,
+    val_idxs = train_data.get_node_val_set(eval_ntype)
+    eval_dataloader = GSgnnNodeDataLoader(train_data,
+                                          val_idxs,
+                                          fanout=config.fanout,
+                                          batch_size=config.eval_batch_size,
+                                          label_field=config.label_field,
+                                          node_feats=node_feat_fields,
                                           train_task=False)
     # Optional: Define the evaluation dataloader
-    test_dataloader = GSgnnNodeDataLoader(train_data, train_data.test_idxs,fanout=config.fanout,
-                                          batch_size=config.eval_batch_size, device=device,
+    test_idxs = train_data.get_node_test_set(eval_ntype)
+    test_dataloader = GSgnnNodeDataLoader(train_data,
+                                          test_idxs,
+                                          fanout=config.fanout,
+                                          batch_size=config.eval_batch_size,
+                                          label_field=config.label_field,
+                                          node_feats=node_feat_fields,
                                           train_task=False)
 
-GraphStorm provides a set of dataloaders for different GML tasks. Here we deal with a node task, hence using the node dataloader, which takes the graph data created above as the first argument. The second argument is the label index that the GraphStorm dataset extracts from the graph as indicated in the target nodes' ``train_mask``, ``val_mask``, and ``test_mask``, which are automatically generated by GraphStorm graph construction tool with the specified ``split_pct`` field. The ``GSgnnNodeTrainData`` automatically extracts these indexes out and set its properties so that you can directly use them like ``graph_data.train_idxs`` and ``graph_data.val_idxs``, and ``graph_data.test_idxs``. The rest of arguments are similar to the common training flow, except that we set the ``train_task`` to be ``False`` for the evaluation and test dataloader.
+GraphStorm provides a set of dataloaders for different GML tasks. Here we deal with a node task, hence using the node dataloader, which takes the graph data created above as the first argument. The second argument is the label index that the GraphStorm dataset extracts from the graph as indicated in the target nodes' ``train_mask``, ``val_mask``, and ``test_mask``, which are automatically generated by GraphStorm graph construction tool with the specified ``split_pct`` field. The ``GSgnnData`` provides functions to get the indexes of train data, validation data and test data through ``get_node_train_set``, ``get_node_val_set`` and ``get_node_test_set``, respectively.
+The ``label_field`` is also required by ``GSgnnNodeDataLoader`` to get the labels for model training and evaluation.
+The ``node_feats`` and ``edge_feats`` arguments of ``GSgnnNodeDataLoader`` are optional; they define the node features and edge features, respectively, to be used for the task associated with the dataloader.
+The rest of arguments are similar to the common training flow, except that we set the ``train_task`` to be ``False`` for the evaluation and test dataloader.
 
 Use GraphStorm's model trainer to wrap your model and attach evaluator and task tracker to it
 ````````````````````````````````````````````````````````````````````````````````````````````````
@@ -263,13 +276,13 @@ The GraphStorm trainers can have evaluators and task trackers associated. The fo
 .. code-block:: python
 
     # Optional: set up a evaluator
-    evaluator = GSgnnAccEvaluator(config.eval_frequency,
-                                  config.eval_metric,
-                                  config.multilabel,
-                                  config.use_early_stop,
-                                  config.early_stop_burnin_rounds,
-                                  config.early_stop_rounds,
-                                  config.early_stop_strategy)
+    evaluator = GSgnnClassificationEvaluator(config.eval_frequency,
+                                             config.eval_metric,
+                                             config.multilabel,
+                                             config.use_early_stop,
+                                             config.early_stop_burnin_rounds,
+                                             config.early_stop_rounds,
+                                             config.early_stop_strategy)
     trainer.setup_evaluator(evaluator)
     # Optional: set up a task tracker to show the progress of training.
     tracker = GSSageMakerTaskTracker(config.eval_frequency)

diff --git a/docs/source/api/graphstorm.dataloading.rst b/docs/source/api/graphstorm.dataloading.rst
@@ -33,10 +33,7 @@ DataSets
     :nosignatures:
     :template: datasettemplate.rst
 
-    GSgnnNodeTrainData
-    GSgnnNodeInferData
-    GSgnnEdgeTrainData
-    GSgnnEdgeInferData
+    GSgnnData
 
 DataLoaders
 ------------

diff --git a/docs/source/api/graphstorm.eval.rst b/docs/source/api/graphstorm.eval.rst
@@ -7,8 +7,10 @@ graphstorm.eval
     Learning (GML) tasks.
 
     If users want to implement customized evaluators or evaluation methods, a best practice is to
-    extend base evaluators, i.e., the ``GSgnnInstanceEvaluator`` class for node or edge prediction
-    tasks, and ``GSgnnLPEvaluator`` for link prediction tasks, and then implement the abstract methods.
+    extend the base evaluator, i.e., the ``GSgnnBaseEvaluator``, and the corresponding evaluation
+    interfaces, e.g., ``GSgnnPredictionEvalInterface`` for prediction evaluation, and
+    ``GSgnnLPRankingEvalInterface`` for ranking based link prediction evaluation, and then
+    implement the abstract methods defined in those interface classes.
 
 .. currentmodule:: graphstorm.eval
 
@@ -20,8 +22,9 @@ Base Evaluators
     :nosignatures:
     :template: evaltemplate.rst
 
-    GSgnnInstanceEvaluator
-    GSgnnLPEvaluator
+    GSgnnBaseEvaluator
+    GSgnnPredictionEvalInterface
+    GSgnnLPRankingEvalInterface
 
 Evaluators
 -----------
@@ -31,8 +34,7 @@ Evaluators
     :nosignatures:
     :template: evaltemplate.rst
 
-    GSgnnLPEvaluator
+    GSgnnClassificationEvaluator
+    GSgnnRegressionEvaluator
     GSgnnMrrLPEvaluator
     GSgnnPerEtypeMrrLPEvaluator
-    GSgnnAccEvaluator
-    GSgnnRegressionEvaluator
diff --git a/docs/source/gs-processing/gs-processing-getting-started.rst b/docs/source/gs-processing/gs-processing-getting-started.rst
@@ -145,11 +145,12 @@ distributed training pipeline.
 Running on AWS resources
 ------------------------
 
-GSProcessing supports Amazon SageMaker and EMR Serverless as execution environments.
+GSProcessing supports Amazon SageMaker, EMR on EC2, and EMR Serverless as execution environments.
 To run distributed jobs on AWS resources we will have to build a Docker image
 and push it to the Amazon Elastic Container Registry, which we cover in
-:doc:`usage/distributed-processing-setup` and run a SageMaker Processing
-job which we describe in :doc:`usage/amazon-sagemaker`, or EMR Serverless
+:doc:`usage/distributed-processing-setup`. We can then run either a SageMaker Processing
+job which we describe in :doc:`usage/amazon-sagemaker`, an EMR on EC2 job which
+we describe in :doc:`usage/emr`, or an EMR Serverless
 job that is covered in :doc:`usage/emr-serverless`.