Skip to content

Commit

Permalink
Merge branch 'main' into parmetis
Browse files Browse the repository at this point in the history
  • Loading branch information
jalencato authored May 3, 2024
2 parents 71b373b + 6054339 commit 31d525b
Show file tree
Hide file tree
Showing 86 changed files with 5,349 additions and 2,694 deletions.
1 change: 0 additions & 1 deletion .github/workflow_scripts/lint_check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ set -ex

python3 -m pip install --upgrade prospector pip
yes | pip3 install astroid==v3.0.0
FORCE_CUDA=1 python3 -m pip install -e '.[test]' --no-build-isolation
pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/data/*.py
pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/distributed/
pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/dataloading/
Expand Down
95 changes: 69 additions & 26 deletions docker/Dockerfile.local
Original file line number Diff line number Diff line change
@@ -1,34 +1,76 @@
FROM nvidia/cuda:11.3.1-devel-ubuntu20.04
ARG DEVICE=gpu
ARG SOURCE

FROM ${SOURCE} as base

ENV DEBIAN_FRONTEND=noninteractive
ENV HOME=/root

RUN apt-get update
RUN apt-get install -y --no-install-recommends build-essential python3-dev make g++ vim net-tools
ARG DGL_VERSION=1.1.3
ARG OGB_VERSION=1.3.6
ARG TORCH_VERSION=2.1.2
ARG TRANSFORMERS_VERSION=4.28.1

RUN apt-get install -y python3-pip git wget psmisc
RUN apt-get install -y cmake
RUN apt update && apt install -y --no-install-recommends \
git \
libicu-dev \
openssh-client \
openssh-server \
python3.9 \
python3.9-distutils \
python3.9-venv \
&& rm -rf /var/lib/apt/lists/*

# Install Pytorch
RUN pip3 install networkx==3.1 pydantic
RUN pip3 install torch==2.1.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
# Create and activate a Python venv
RUN python3.9 -m venv /opt/gs-venv
ENV PATH="/opt/gs-venv/bin:$PATH"

# Install DGL
RUN pip3 install dgl==1.0.4+cu117 -f https://data.dgl.ai/wheels/cu117/repo.html
ENV PYTHONPATH="/root/dgl/tools/:${PYTHONPATH}"
# Install GraphStorm dependencies
RUN pip install \
boto3==1.34.89 \
botocore==1.34.89 \
h5py==3.11.0 \
networkx==3.1 \
psutil==5.9.8 \
pyarrow==14.0.0 \
pydantic==2.7.0 \
scikit-learn==1.4.2 \
scipy==1.13.0 \
&& rm -rf /root/.cache

FROM base as base-cpu

# Install torch, DGL, and GSF deps that require torch
RUN pip install \
torch==${TORCH_VERSION} \
--index-url https://download.pytorch.org/whl/cpu \
&& rm -rf /root/.cache

# Install related Python packages
RUN pip3 install ogb==1.3.6 scipy pyarrow boto3 scikit-learn transformers
RUN pip install \
dgl==${DGL_VERSION} \
ogb==${OGB_VERSION} \
transformers==${TRANSFORMERS_VERSION} \
-f https://data.dgl.ai/wheels-internal/repo.html && rm -rf /root/.cache

# Install other dependencies
RUN apt-get install -y cython3 libicu-dev
RUN pip3 install h5py psutil
FROM base as base-gpu

RUN apt-get install -y unzip
# Install torch, DGL, and GSF deps that require torch
RUN pip install \
dgl==${DGL_VERSION}+cu121 \
ogb==${OGB_VERSION} \
torch==${TORCH_VERSION} \
transformers==${TRANSFORMERS_VERSION} \
-f https://data.dgl.ai/wheels/cu121/repo.html \
&& rm -rf /root/.cache

FROM base-${DEVICE} as runtime

ENV PYTHONPATH="/root/dgl/tools/:${PYTHONPATH}"

# Download DGL source code
RUN cd /root; git clone --recursive https://github.com/dmlc/dgl.git
RUN cd /root; git clone --single-branch https://github.com/dmlc/dgl.git

# Install GraphStorm from source code
# Copy GraphStorm source and add to PYTHONPATH
RUN mkdir -p /graphstorm
COPY code/python/graphstorm /graphstorm/python/graphstorm
ENV PYTHONPATH="/graphstorm/python/:${PYTHONPATH}"
Expand All @@ -39,16 +81,17 @@ COPY code/inference_scripts /graphstorm/inference_scripts
COPY code/tools /graphstorm/tools
COPY code/training_scripts /graphstorm/training_scripts

# Set up SSH
RUN apt-get install -y openssh-client openssh-server
# Set up SSH access
ENV SSH_PORT=2222

RUN cat /etc/ssh/sshd_config > /tmp/sshd_config && \
sed "0,/^#Port 22/s//Port ${SSH_PORT}/" /tmp/sshd_config > /etc/ssh/sshd_config
ENV SSHDIR $HOME/.ssh
RUN mkdir -p ${SSHDIR}
RUN ssh-keygen -t rsa -f ${SSHDIR}/id_rsa -N ''
RUN cp ${SSHDIR}/id_rsa.pub ${SSHDIR}/authorized_keys
RUN mkdir -p ${SSHDIR} \
&& ssh-keygen -t rsa -f ${SSHDIR}/id_rsa -N '' \
&& cp ${SSHDIR}/id_rsa.pub ${SSHDIR}/authorized_keys \
&& mkdir /run/sshd

EXPOSE ${SSH_PORT}

EXPOSE 2222
RUN mkdir /run/sshd
CMD ["/usr/sbin/sshd", "-D"]
35 changes: 31 additions & 4 deletions docker/build_docker_oss4local.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#!/bin/bash
set -eox pipefail

# process argument 1: graphstorm home folder
if [ -z "$1" ]; then
echo "Please provide the graphstorm home folder that the graphstorm codes are cloned to."
echo "For example, ./build_docker_oss4local.sh /graph-storm/"
echo "Please provide a path to the root directory of the GraphStorm repository."
echo "For example, ./build_docker_oss4local.sh ../ graphstorm local gpu"
exit 1
else
GSF_HOME="$1"
Expand All @@ -23,6 +24,13 @@ else
TAG="$3"
fi

# process argument 4: docker image type ("gpu" or "cpu").
# Falls back to "gpu" when the argument is missing or empty.
DEVICE_TYPE="${4:-gpu}"

# Copy scripts and tools codes to the docker folder
mkdir -p $GSF_HOME"/docker/code"
cp -r $GSF_HOME"/python" $GSF_HOME"/docker/code/python"
Expand All @@ -31,11 +39,30 @@ cp -r $GSF_HOME"/inference_scripts" $GSF_HOME"/docker/code/inference_scripts"
cp -r $GSF_HOME"/tools" $GSF_HOME"/docker/code/tools"
cp -r $GSF_HOME"/training_scripts" $GSF_HOME"/docker/code/training_scripts"

aws ecr-public get-login-password --region us-east-1 | \
docker login --username AWS --password-stdin public.ecr.aws

# Build OSS docker for EC2 instances that can pull ECR docker images
DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}"
DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}-${DEVICE_TYPE}"

echo "Build a local docker image ${DOCKER_FULLNAME}"
docker build --no-cache -f $GSF_HOME"/docker/Dockerfile.local" . -t $DOCKER_FULLNAME

# Select the base image passed to the Dockerfile as the SOURCE build arg,
# based on the requested device type. Any value other than "gpu" or "cpu"
# is rejected: the staged code folder is cleaned up and the script exits 1.
if [[ "${DEVICE_TYPE}" == "gpu" ]]; then
  SOURCE_IMAGE="nvidia/cuda:12.1.0-runtime-ubuntu20.04"
elif [[ "${DEVICE_TYPE}" == "cpu" ]]; then
  SOURCE_IMAGE="public.ecr.aws/ubuntu/ubuntu:20.04_stable"
else
  # printf keeps the offending value verbatim (no word-splitting, no
  # backslash interpretation) and sends the diagnostic to stderr.
  printf 'Image type can only be "gpu" or "cpu", but got "%s"\n' "${DEVICE_TYPE}" >&2
  # Remove the temporary code folder from where this script staged it
  # (${GSF_HOME}/docker/code), not from whatever the current directory is.
  # The :? guard aborts instead of expanding to "/docker/code" if GSF_HOME
  # were ever empty.
  rm -rf -- "${GSF_HOME:?}/docker/code"
  exit 1
fi

# Use Buildkit to avoid pulling both CPU and GPU images
DOCKER_BUILDKIT=1 docker build \
--build-arg DEVICE=$DEVICE_TYPE \
--build-arg SOURCE=${SOURCE_IMAGE} \
-f "${GSF_HOME}/docker/Dockerfile.local" . -t $DOCKER_FULLNAME

# remove the temporary code folder
rm -rf $GSF_HOME"/docker/code"
12 changes: 6 additions & 6 deletions docker/build_docker_sagemaker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ fi

# process argument 2: docker image type, default is GPU
if [ -z "$2" ]; then
IMAGE_TYPE="gpu"
DEVICE_TYPE="gpu"
else
IMAGE_TYPE="$2"
DEVICE_TYPE="$2"
fi

# process argument 3: docker image name, default is graphstorm
Expand All @@ -39,20 +39,20 @@ cp -r "${GSF_HOME}/sagemaker" code/graphstorm/sagemaker
cp -r "${GSF_HOME}/docker/sagemaker/build_artifacts" build_artifacts

# Build OSS docker for EC2 instances that can pull ECR docker images
DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}"
DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}-${DEVICE_TYPE}"

echo "Build a sagemaker docker image ${DOCKER_FULLNAME}"

# Log in to ECR to pull Docker image
aws ecr get-login-password --region us-east-1 \
| docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com

if [ $IMAGE_TYPE = "gpu" ] || [ $IMAGE_TYPE = "cpu" ]; then
if [ $DEVICE_TYPE = "gpu" ] || [ $DEVICE_TYPE = "cpu" ]; then
# Use Buildkit to avoid pulling both CPU and GPU images
DOCKER_BUILDKIT=1 docker build --build-arg DEVICE=$IMAGE_TYPE \
DOCKER_BUILDKIT=1 docker build --build-arg DEVICE=$DEVICE_TYPE \
-f "${GSF_HOME}/docker/sagemaker/Dockerfile.sm" . -t $DOCKER_FULLNAME
else
echo "Image type can only be \"gpu\" or \"cpu\", but got \""$IMAGE_TYPE"\""
echo "Device type can only be \"gpu\" or \"cpu\", but got \""$DEVICE_TYPE"\""
# remove the temporary code folder
rm -rf code
exit 1
Expand Down
61 changes: 37 additions & 24 deletions docs/source/advanced/own-models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -198,37 +198,50 @@ the ``ip_config`` argument specifies a ip configuration file, which contains the

Replace DGL DataLoader with the GraphStorm's dataset and dataloader
`````````````````````````````````````````````````````````````````````
Because the GraphStorm uses distributed graphs, we need to first load the partitioned graph, which is created in the :ref:`Step 1 <step-1>`, with the `GSgnnNodeTrainData <https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataset.py#L469>`_ class (for edge tasks, GraphStorm also provides `GSgnnEdgeTrainData <https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataset.py#L216>`_). The ``GSgnnNodeTrainData`` could be created as shown in the code below.
Because GraphStorm uses distributed graphs, we first need to load the partitioned graph, which is created in :ref:`Step 1 <step-1>`, with the `GSgnnData <https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataset.py#L57>`_ class (the same class is also used for edge tasks). A ``GSgnnData`` object can be created as shown in the code below.

.. code-block:: python
train_data = GSgnnNodeTrainData(config.graph_name,
config.part_config,
train_ntypes=config.target_ntype,
node_feat_field=node_feat_fields,
label_field=config.label_field)
train_data = GSgnnData(config.part_config)
Arguments of this class include the partition configuration JSON file path, which are the outputs of the :ref:`Step 1 <step-1>`. The ``graph_name`` can be found in the JSON file.
The arguments of this class include the path to the partition configuration JSON file, which is one of the outputs of :ref:`Step 1 <step-1>`.

The other values, the ``train_ntypes``, the ``label_field``, and the ``node_feat_field``, should be consistent with the values in the raw data :ref:`input configuration JSON <input-config>` defined in the :ref:`Step 1 <step-1>`. The ``train_ntypes`` is the ``node_type`` that has ``labels`` specified. The ``label_fields`` is the value specified in ``label_col`` of the ``train_ntype``. The ``node_feat_field`` is a dictionary, whose key is the values of ``node_type``, and value is the values of ``feature_name``.

Then we can put this dataset into GraphStorm's `GSgnnNodeDataLoader <https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataloading.py#L544>`_, which is like:
Then we can put this dataset into GraphStorm's `GSgnnNodeDataLoader <https://github.com/awslabs/graphstorm/blob/main/python/graphstorm/dataloading/dataloading.py#L1237>`_, which is like:

.. code-block:: python
# Get train idx
train_idxs = train_data.get_node_train_set(config.target_ntype)
# Define the GraphStorm train dataloader
dataloader = GSgnnNodeDataLoader(train_data, train_data.train_idxs, fanout=config.fanout,
batch_size=config.batch_size, device=device, train_task=True)
dataloader = GSgnnNodeDataLoader(train_data,
train_idxs, fanout=config.fanout,
batch_size=config.batch_size,
label_field=config.label_field,
node_feats=node_feat_fields,train_task=True)
# Optional: Define the evaluation dataloader
eval_dataloader = GSgnnNodeDataLoader(train_data, train_data.val_idxs,fanout=config.fanout,
batch_size=config.eval_batch_size, device=device,
val_idxs = train_data.get_node_val_set(eval_ntype)
eval_dataloader = GSgnnNodeDataLoader(train_data,
val_idxs,
fanout=config.fanout,
batch_size=config.eval_batch_size,
label_field=config.label_field,
node_feats=node_feat_fields,
train_task=False)
# Optional: Define the test dataloader
test_dataloader = GSgnnNodeDataLoader(train_data, train_data.test_idxs,fanout=config.fanout,
batch_size=config.eval_batch_size, device=device,
test_idxs = train_data.get_node_test_set(eval_ntype)
test_dataloader = GSgnnNodeDataLoader(train_data,
test_idxs,
fanout=config.fanout,
batch_size=config.eval_batch_size,
label_field=config.label_field,
node_feats=node_feat_fields,
train_task=False)
GraphStorm provides a set of dataloaders for different GML tasks. Here we deal with a node task, hence using the node dataloader, which takes the graph data created above as the first argument. The second argument is the label index that the GraphStorm dataset extracts from the graph as indicated in the target nodes' ``train_mask``, ``val_mask``, and ``test_mask``, which are automatically generated by GraphStorm graph construction tool with the specified ``split_pct`` field. The ``GSgnnNodeTrainData`` automatically extracts these indexes out and set its properties so that you can directly use them like ``graph_data.train_idxs`` and ``graph_data.val_idxs``, and ``graph_data.test_idxs``. The rest of arguments are similar to the common training flow, except that we set the ``train_task`` to be ``False`` for the evaluation and test dataloader.
GraphStorm provides a set of dataloaders for different GML tasks. Here we deal with a node task, hence using the node dataloader, which takes the graph data created above as the first argument. The second argument is the label index that the GraphStorm dataset extracts from the graph as indicated in the target nodes' ``train_mask``, ``val_mask``, and ``test_mask``, which are automatically generated by GraphStorm graph construction tool with the specified ``split_pct`` field. The ``GSgnnData`` provides functions to get the indexes of train data, validation data and test data through ``get_node_train_set``, ``get_node_val_set`` and ``get_node_test_set``, respectively.
The ``label_field`` is also required by the GSgnnNodeDataLoader to get the labels for model training and evaluation.
The ``node_feats`` and ``edge_feats`` are optional to GSgnnNodeDataLoader, which define the node features and edge features, respectively, to be used for the task associated with the dataloader.
The rest of arguments are similar to the common training flow, except that we set the ``train_task`` to be ``False`` for the evaluation and test dataloader.

Use GraphStorm's model trainer to wrap your model and attach evaluator and task tracker to it
````````````````````````````````````````````````````````````````````````````````````````````````
Expand Down Expand Up @@ -263,13 +276,13 @@ The GraphStorm trainers can have evaluators and task trackers associated. The fo
.. code-block:: python
# Optional: set up an evaluator
evaluator = GSgnnAccEvaluator(config.eval_frequency,
config.eval_metric,
config.multilabel,
config.use_early_stop,
config.early_stop_burnin_rounds,
config.early_stop_rounds,
config.early_stop_strategy)
evaluator = GSgnnClassificationEvaluator(config.eval_frequency,
config.eval_metric,
config.multilabel,
config.use_early_stop,
config.early_stop_burnin_rounds,
config.early_stop_rounds,
config.early_stop_strategy)
trainer.setup_evaluator(evaluator)
# Optional: set up a task tracker to show the progress of training.
tracker = GSSageMakerTaskTracker(config.eval_frequency)
Expand Down
5 changes: 1 addition & 4 deletions docs/source/api/graphstorm.dataloading.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,7 @@ DataSets
:nosignatures:
:template: datasettemplate.rst

GSgnnNodeTrainData
GSgnnNodeInferData
GSgnnEdgeTrainData
GSgnnEdgeInferData
GSgnnData

DataLoaders
------------
Expand Down
16 changes: 9 additions & 7 deletions docs/source/api/graphstorm.eval.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ graphstorm.eval
Learning (GML) tasks.

If users want to implement customized evaluators or evaluation methods, a best practice is to
extend base evaluators, i.e., the ``GSgnnInstanceEvaluator`` class for node or edge prediction
tasks, and ``GSgnnLPEvaluator`` for link prediction tasks, and then implement the abstract methods.
extend the base evaluator, i.e., the ``GSgnnBaseEvaluator``, and the corresponding evaluation
interfaces, e.g., ``GSgnnPredictionEvalInterface`` for prediction evaluation, and
``GSgnnLPRankingEvalInterface`` for ranking based link prediction evaluation, and then
implement the abstract methods defined in those interface classes.

.. currentmodule:: graphstorm.eval

Expand All @@ -20,8 +22,9 @@ Base Evaluators
:nosignatures:
:template: evaltemplate.rst

GSgnnInstanceEvaluator
GSgnnLPEvaluator
GSgnnBaseEvaluator
GSgnnPredictionEvalInterface
GSgnnLPRankingEvalInterface

Evaluators
-----------
Expand All @@ -31,8 +34,7 @@ Evaluators
:nosignatures:
:template: evaltemplate.rst

GSgnnLPEvaluator
GSgnnClassificationEvaluator
GSgnnRegressionEvaluator
GSgnnMrrLPEvaluator
GSgnnPerEtypeMrrLPEvaluator
GSgnnAccEvaluator
GSgnnRegressionEvaluator
7 changes: 4 additions & 3 deletions docs/source/gs-processing/gs-processing-getting-started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -145,11 +145,12 @@ distributed training pipeline.
Running on AWS resources
------------------------

GSProcessing supports Amazon SageMaker and EMR Serverless as execution environments.
GSProcessing supports Amazon SageMaker, EMR on EC2, and EMR Serverless as execution environments.
To run distributed jobs on AWS resources we will have to build a Docker image
and push it to the Amazon Elastic Container Registry, which we cover in
:doc:`usage/distributed-processing-setup` and run a SageMaker Processing
job which we describe in :doc:`usage/amazon-sagemaker`, or EMR Serverless
:doc:`usage/distributed-processing-setup`. We can then run either a SageMaker Processing
job which we describe in :doc:`usage/amazon-sagemaker`, an EMR on EC2 job which
we describe in :doc:`usage/emr`, or an EMR Serverless
job that is covered in :doc:`usage/emr-serverless`.


Expand Down
Loading

0 comments on commit 31d525b

Please sign in to comment.