diff --git a/.github/workflow_scripts/lint_check.sh b/.github/workflow_scripts/lint_check.sh index 3eecaf93c3..f49c156926 100644 --- a/.github/workflow_scripts/lint_check.sh +++ b/.github/workflow_scripts/lint_check.sh @@ -4,6 +4,8 @@ cd ../../ set -ex python3 -m pip install --upgrade prospector pip +pip3 uninstall -y astroid +yes | pip3 install astroid==2.15.7 FORCE_CUDA=1 python3 -m pip install -e '.[test]' --no-build-isolation pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/data/*.py pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/dataloading/ diff --git a/docker/build_docker_wholegraph.sh b/docker/build_docker_wholegraph.sh new file mode 100644 index 0000000000..7caa3bef0a --- /dev/null +++ b/docker/build_docker_wholegraph.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# process argument 1: graphstorm home folder +if [ -z "$1" ]; then + echo "Please provide the graphstorm home folder that the GraphStorm code is cloned to." + echo "For example, ./build_docker_wholegraph.sh /graph-storm/" + exit 1 +else + GSF_HOME="$1" +fi + +# process argument 2: docker image name, default is graphstorm +if [ -z "$2" ]; then + IMAGE_NAME="graphstorm-wholegraph" +else + IMAGE_NAME="$2" +fi + +# process argument 3: image's tag name, default is local +if [ -z "$3" ]; then + TAG="local" +else + TAG="$3" +fi + +# Copy scripts and tools code to the docker folder +mkdir -p $GSF_HOME"/docker/code" +cp -r $GSF_HOME"/python" $GSF_HOME"/docker/code/python" +cp -r $GSF_HOME"/inference_scripts" $GSF_HOME"/docker/code/inference_scripts" +cp -r $GSF_HOME"/tools" $GSF_HOME"/docker/code/tools" +cp -r $GSF_HOME"/training_scripts" $GSF_HOME"/docker/code/training_scripts" + +# Build OSS docker for EC2 instances that can pull ECR docker images +DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}" + +echo "Build a local docker image ${DOCKER_FULLNAME}" +docker build --no-cache -f $GSF_HOME"/docker/wholegraph/Dockerfile" .
-t $DOCKER_FULLNAME + +# remove the temporary code folder +rm -rf $GSF_HOME"/docker/code" diff --git a/docker/wholegraph/Dockerfile b/docker/wholegraph/Dockerfile new file mode 100644 index 0000000000..43b2278426 --- /dev/null +++ b/docker/wholegraph/Dockerfile @@ -0,0 +1,60 @@ +FROM nvcr.io/nvidia/dgl:23.07-py3 + +################################################# +## Install EFA installer +ARG EFA_INSTALLER_VERSION=latest +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && apt-get update \ + && apt-get install -y libhwloc-dev \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf /var/lib/apt/lists/* + +################################################### +## Install AWS-OFI-NCCL plugin +ARG AWS_OFI_NCCL_VERSION=v1.7.1-aws +RUN git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \ + && cd /opt/aws-ofi-nccl \ + && git checkout ${AWS_OFI_NCCL_VERSION} \ + && ./autogen.sh \ + && ./configure --prefix=/opt/aws-ofi-nccl/ \ + --with-libfabric=/opt/amazon/efa/ \ + --with-cuda=/usr/local/cuda \ + && make && make install + +ENV PATH "/opt/amazon/efa/bin:$PATH" + +# Install WholeGraph +COPY wholegraph/install_wholegraph.sh install_wholegraph.sh +RUN bash install_wholegraph.sh + +# Install GraphStorm +RUN pip install --no-cache-dir boto3 'h5py>=2.10.0' scipy tqdm 'pyarrow>=3' 'transformers==4.28.1' pandas pylint scikit-learn ogb psutil +RUN git clone https://github.com/awslabs/graphstorm + +# Increase nofile limit +RUN echo "root soft nofile 1048576" >> /etc/security/limits.conf \ + && echo "root hard nofile 1048576" >> /etc/security/limits.conf + +# Make EFA NCCL plugin the default plugin +RUN sed -i '/nccl_rdma_sharp_plugin/d' /etc/ld.so.conf.d/hpcx.conf \ + && echo "/opt/aws-ofi-nccl/lib" >> /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +# Set up SSH +RUN apt-get update && apt-get install -y openssh-client openssh-server && rm -rf /var/lib/apt/lists/* +ENV SSH_PORT=2222 +RUN cat /etc/ssh/sshd_config > /tmp/sshd_config && \ + sed "0,/^#Port 22/s//Port ${SSH_PORT}/" /tmp/sshd_config > /etc/ssh/sshd_config +ENV HOME=/root +ENV SSHDIR $HOME/.ssh +RUN mkdir -p ${SSHDIR} +RUN ssh-keygen -t rsa -f ${SSHDIR}/id_rsa -N '' +RUN cp ${SSHDIR}/id_rsa.pub ${SSHDIR}/authorized_keys +RUN touch /root/.ssh/config;echo -e "Host *\n StrictHostKeyChecking no\n UserKnownHostsFile=/dev/null\n Port ${SSH_PORT}" > /root/.ssh/config +EXPOSE 2222 +RUN mkdir /run/sshd + +CMD ["/usr/sbin/sshd", "-D"] diff --git a/docker/wholegraph/install_wholegraph.sh b/docker/wholegraph/install_wholegraph.sh new file mode 100644 index 0000000000..e0d5b4fee2 --- /dev/null +++ b/docker/wholegraph/install_wholegraph.sh @@ -0,0 +1,25 @@ +#!/bin/bash +git clone https://github.com/fmtlib/fmt.git /opt/fmt +cd /opt/fmt +git checkout 9.1.0 +mkdir build && cd build +cmake -DCMAKE_POSITION_INDEPENDENT_CODE=TRUE .. +make +make install + +git clone https://github.com/gabime/spdlog.git /opt/spdlog +cd /opt/spdlog && mkdir build && cd build +cmake .. 
&& make -j +cp libspdlog.a /usr/lib/libspdlog.a +export PYTHON=/usr/bin/python + +cd /opt/rapids/ +git clone https://github.com/rapidsai/wholegraph.git -b branch-23.08 +cd /opt/rapids/wholegraph/ +pip install scikit-build +export WHOLEGRAPH_CMAKE_CUDA_ARCHITECTURES="70-real;80-real;90" +# fix a bug in CMakeLists.txt when building pylibwholegraph +old="import sysconfig; print(sysconfig.get_config_var('BINLIBDEST'))" +string="import sysconfig; print(\"%s/%s\" % (sysconfig.get_config_var(\"LIBDIR\"), sysconfig.get_config_var(\"INSTSONAME\")))" +sed -i "s|$old|$string|" /opt/rapids/wholegraph/python/pylibwholegraph/CMakeLists.txt +bash build.sh libwholegraph pylibwholegraph -v diff --git a/docs/source/_templates/dataloadertemplate.rst b/docs/source/_templates/dataloadertemplate.rst new file mode 100644 index 0000000000..f02d586215 --- /dev/null +++ b/docs/source/_templates/dataloadertemplate.rst @@ -0,0 +1,10 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :show-inheritance: + :special-members: __iter__, __next__ \ No newline at end of file diff --git a/docs/source/_templates/datasettemplate.rst b/docs/source/_templates/datasettemplate.rst new file mode 100644 index 0000000000..b503bdbb1e --- /dev/null +++ b/docs/source/_templates/datasettemplate.rst @@ -0,0 +1,10 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :show-inheritance: + :members: prepare_data, get_node_feats, get_edge_feats, get_labels diff --git a/docs/source/_templates/classtemplate.rst b/docs/source/_templates/evaltemplate.rst similarity index 100% rename from docs/source/_templates/classtemplate.rst rename to docs/source/_templates/evaltemplate.rst diff --git a/docs/source/_templates/inferencetemplate.rst b/docs/source/_templates/inferencetemplate.rst new file mode 100644 index 0000000000..c5a289df3e --- /dev/null +++ b/docs/source/_templates/inferencetemplate.rst @@ -0,0 +1,11 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :show-inheritance: + :members: setup_device, setup_evaluator, evaluator, device, infer + diff --git a/docs/source/_templates/modeltemplate.rst b/docs/source/_templates/modeltemplate.rst new file mode 100644 index 0000000000..06a2366ca1 --- /dev/null +++ b/docs/source/_templates/modeltemplate.rst @@ -0,0 +1,10 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :show-inheritance: + :members: forward, save_model, restore_model, predict, create_optimizer \ No newline at end of file diff --git a/docs/source/_templates/trainertemplate.rst b/docs/source/_templates/trainertemplate.rst new file mode 100644 index 0000000000..e4023be1c7 --- /dev/null +++ b/docs/source/_templates/trainertemplate.rst @@ -0,0 +1,12 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :show-inheritance: + :members: setup_device, setup_evaluator, save_model, remove_saved_model, save_topk_models, + get_best_model_path, restore_model, evaluator, optimizer, device, fit, eval + diff --git a/docs/source/api/graphstorm.customized.rst b/docs/source/api/graphstorm.customized.rst deleted file mode 100644 index 9172b469f0..0000000000 --- a/docs/source/api/graphstorm.customized.rst +++ /dev/null @@ -1,62 +0,0 @@ -..
_apicustomized: - -customized model APIs -========================== - - GraphStorm provides a set of APIs for users to integrate their own customized models with - the framework of GraphStorm, so that users' own models can leverage GraphStorm's easy-to-use - and distributed capabilities. - - For how to modify users' own models, please refer to this :ref:`Use Your Own Model Tutorial - `. - - In general, there are three sets of APIs involved in programming customized models. - - * Dataloaders: users need to extend GraphStorm's abstract node or edge dataloader to implement - their own graph samplers or mini_batch generators. - * Models: depending on specific GML tasks, users need to extend the corresponding ModelBase and - ModelInterface, and then implement the required abstract functions. - * Evaluators: if necessary, users can also extend the two evaluator templates to implement their - own performance evaluation method. - -.. currentmodule:: graphstorm - -Dataloaders ------------- -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - - .. dataloading.AbsNodeDataLoader - .. dataloading.AbsEdgeDataLoader - -Models ------------- - -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - - model.GSgnnModelBase - model.GSgnnNodeModelBase - model.GSgnnEdgeModelBase - model.GSgnnLinkPredictionModelBase - model.GSgnnNodeModelInterface - model.GSgnnEdgeModelInterface - model.GSgnnLinkPredictionModelInterface - -Evaluators ------------- - - If users want to implement customized evaluators or evaluation methods, a best practice is to - extend the ``eval.GSgnnInstanceEvaluator`` class, and implement the abstract methods. - -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - - eval.GSgnnInstanceEvaluator - eval.GSgnnLPEvaluator \ No newline at end of file diff --git a/docs/source/api/graphstorm.dataloading.rst b/docs/source/api/graphstorm.dataloading.rst index 9e4b4175ed..b782602eb9 100644 --- a/docs/source/api/graphstorm.dataloading.rst +++ b/docs/source/api/graphstorm.dataloading.rst @@ -3,29 +3,48 @@ graphstorm.dataloading ========================== - GraphStorm dataloading module includes a set of graph datasets and dataloaders for different + GraphStorm dataloading module includes a set of graph DataSets and DataLoaders for different graph machine learning tasks. + If users would like to customize DataLoaders, please extend those classes in the + :ref:`Base DataLoaders <basedataloaders>` section and implement their abstract methods. + .. currentmodule:: graphstorm.dataloading +.. _basedataloaders: + +Base DataLoaders +------------------- + +.. autosummary:: + :toctree: ../generated/ + :nosignatures: + :template: dataloadertemplate.rst + + GSgnnNodeDataLoaderBase + GSgnnEdgeDataLoaderBase + GSgnnLinkPredictionDataLoaderBase + DataSets ------------ + .. autosummary:: + :toctree: ../generated/ + :nosignatures: - :template: classtemplate.rst + :template: datasettemplate.rst GSgnnNodeTrainData GSgnnNodeInferData GSgnnEdgeTrainData GSgnnEdgeInferData -Dataloaders +DataLoaders ------------ + .. autosummary:: :toctree: ../generated/ :nosignatures: - :template: classtemplate.rst + :template: dataloadertemplate.rst GSgnnNodeDataLoader GSgnnEdgeDataLoader diff --git a/docs/source/api/graphstorm.eval.rst b/docs/source/api/graphstorm.eval.rst new file mode 100644 index 0000000000..a8193f0f2c --- /dev/null +++ b/docs/source/api/graphstorm.eval.rst @@ -0,0 +1,38 @@ +..
_apieval: + +graphstorm.eval +======================= + + GraphStorm provides built-in evaluation methods for different Graph Machine + Learning (GML) tasks. + + If users want to implement customized evaluators or evaluation methods, a best practice is to + extend base evaluators, i.e., the ``GSgnnInstanceEvaluator`` class for node or edge prediction + tasks, and ``GSgnnLPEvaluator`` for link prediction tasks, and then implement the abstract methods. + +.. currentmodule:: graphstorm.eval + +Base Evaluators +---------------- + +.. autosummary:: + :toctree: ../generated/ + :nosignatures: + :template: evaltemplate.rst + + GSgnnInstanceEvaluator + GSgnnLPEvaluator + +Evaluators +----------- + +.. autosummary:: + :toctree: ../generated/ + :nosignatures: + :template: evaltemplate.rst + + GSgnnMrrLPEvaluator + GSgnnPerEtypeMrrLPEvaluator + GSgnnAccEvaluator + GSgnnRegressionEvaluator diff --git a/docs/source/api/graphstorm.evaluator.rst b/docs/source/api/graphstorm.evaluator.rst deleted file mode 100644 index 35549b2946..0000000000 --- a/docs/source/api/graphstorm.evaluator.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _apievaluator: - -graphstorm.evaluator -======================= - - GraphStorm evaluators provides built-in evaluation methods for different Graph Machine - Learning (GML). - -.. currentmodule:: graphstorm.eval -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - - GSgnnLPEvaluator - GSgnnMrrLPEvaluator - GSgnnPerEtypeMrrLPEvaluator - GSgnnAccEvaluator - GSgnnRegressionEvaluator - diff --git a/docs/source/api/graphstorm.inferrer.rst b/docs/source/api/graphstorm.inference.rst similarity index 85% rename from docs/source/api/graphstorm.inferrer.rst rename to docs/source/api/graphstorm.inference.rst index 7ec4d7f4e4..15ef5d8a88 100644 --- a/docs/source/api/graphstorm.inferrer.rst +++ b/docs/source/api/graphstorm.inference.rst @@ -1,6 +1,6 @@ -.. _apiinferrer: +.. _apiinference: -graphstorm.inferrer +graphstorm.inference ==================== GraphStorm inferrers assemble the distributed inference pipeline for different tasks. @@ -13,7 +13,7 @@ graphstorm.inferrer .. autosummary:: :toctree: ../generated/ :nosignatures: - :template: classtemplate.rst + :template: inferencetemplate.rst GSgnnLinkPredictionInferrer GSgnnNodePredictionInferrer diff --git a/docs/source/api/graphstorm.model.rst b/docs/source/api/graphstorm.model.rst index 76053c15eb..b4052bbd1a 100644 --- a/docs/source/api/graphstorm.model.rst +++ b/docs/source/api/graphstorm.model.rst @@ -1,9 +1,9 @@ .. _apimodel: graphstorm.model -================= +=================== - A GraphStorm model normally contains three components: + A GraphStorm model may contain three components: * Input layer: a set of modules to convert input data for different use cases, e.g., embedding text features. * Decoder: a set of modules to convert results from encoders for different tasks, e.g., classification, regression, or link prediction. + Currently GraphStorm releases the first two sets of components. + + If users would like to implement their own model, the best practice is to extend the corresponding ``***ModelBase``, and implement the abstract methods. + .. currentmodule:: graphstorm.model -Model input layers +Base models +------------ + +.. autosummary:: + :toctree: ../generated/ + :nosignatures: + :template: modeltemplate.rst + + GSgnnNodeModelBase + GSgnnEdgeModelBase + GSgnnLinkPredictionModelBase + +Input Layers ------------------- ..
autosummary:: :toctree: ../generated/ @@ -24,7 +40,7 @@ Model input layers GSLMNodeEncoderInputLayer GSPureLMNodeInputLayer -Model encoders and layers +Encoders and GNN Layers -------------------------- .. autosummary:: :toctree: ../generated/ diff --git a/docs/source/api/graphstorm.rst b/docs/source/api/graphstorm.rst index 5de2db120d..2362fdee44 100644 --- a/docs/source/api/graphstorm.rst +++ b/docs/source/api/graphstorm.rst @@ -9,13 +9,12 @@ graphstorm Users can directly use the following code to use these functions. >>> import graphstorm as gs - >>> gs.initialize() - >>> gs.get_rank() + >>> gs.initialize(ip_config="/tmp/ip_list.txt", backend="gloo") + >>> gs.setup_device(local_rank) .. autosummary:: :toctree: ../generated/ + :nosignatures: gsf.initialize - gsf.get_feat_size - utils.get_rank - utils.get_world_size + utils.setup_device diff --git a/docs/source/api/graphstorm.trainer.rst b/docs/source/api/graphstorm.trainer.rst index a90d5c23ff..fd66109f62 100644 --- a/docs/source/api/graphstorm.trainer.rst +++ b/docs/source/api/graphstorm.trainer.rst @@ -11,32 +11,15 @@ graphstorm.trainer .. currentmodule:: graphstorm.trainer - -Base class +Trainers -------------- -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - - GSgnnTrainer -Task classes ----------------- .. autosummary:: :toctree: ../generated/ :nosignatures: - :template: classtemplate.rst + :template: trainertemplate.rst GSgnnLinkPredictionTrainer GSgnnNodePredictionTrainer GSgnnEdgePredictionTrainer - -Method classes ----------------- -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - GLEMNodePredictionTrainer diff --git a/docs/source/configuration/configuration-gconstruction.rst b/docs/source/configuration/configuration-gconstruction.rst index 64feca1f97..b163b08d9f 100644 --- a/docs/source/configuration/configuration-gconstruction.rst +++ b/docs/source/configuration/configuration-gconstruction.rst @@ -87,7 +87,7 @@ Currently, the graph construction pipeline supports the following feature transf * **HuggingFace tokenizer transformation** tokenizes text strings with a HuggingFace tokenizer. The ``name`` field in the feature transformation dictionary is ``tokenize_hf``. The dict should contain two additional fields. ``bert_model`` specifies the LM model used for tokenization. Users can choose any `HuggingFace LM models <https://huggingface.co/models>`_ from one of the following types: ``"bert", "roberta", "albert", "camembert", "ernie", "ibert", "luke", "mega", "mpnet", "nezha", "qdqbert","roc_bert"``. ``max_seq_length`` specifies the maximal sequence length. * **HuggingFace LM transformation** encodes text strings with a HuggingFace LM model. The ``name`` field in the feature transformation dictionary is ``bert_hf``. The dict should contain two additional fields. ``bert_model`` specifies the LM model used for embedding text. Users can choose any `HuggingFace LM models <https://huggingface.co/models>`_ from one of the following types: ``"bert", "roberta", "albert", "camembert", "ernie", "ibert", "luke", "mega", "mpnet", "nezha", "qdqbert","roc_bert"``. ``max_seq_length`` specifies the maximal sequence length. -* **Numerical MAX_MIN transformation** normalizes numerical input features with `val = (val-min)/(max-min)`, where `val` is the feature value, `max` is the maximum number in the feature and `min` is the minimum number in the feature. The ``name`` field in the feature transformation dictionary is ``max_min_norm``. The dict can contains two optional fields.
``max_bound`` specifies the maximum value allowed in the feature. Any number larger than ``max_bound`` will be set to ``max_bound``. ``min_bound`` specifies the minimum value allowed in the feature. Any number smaller than ``min_bound`` will be set to ``min_bound``. +* **Numerical MAX_MIN transformation** normalizes numerical input features with `val = (val-min)/(max-min)`, where `val` is the feature value, `max` is the maximum number in the feature and `min` is the minimum number in the feature. The ``name`` field in the feature transformation dictionary is ``max_min_norm``. The dict can contain four optional fields: ``max_bound``, ``min_bound``, ``max_val`` and ``min_val``. ``max_bound`` specifies the maximum value allowed in the feature. Any number larger than ``max_bound`` will be set to ``max_bound``. Here, `max` = min(np.amax(feats), ``max_bound``). ``min_bound`` specifies the minimum value allowed in the feature. Any number smaller than ``min_bound`` will be set to ``min_bound``. Here, `min` = max(np.amin(feats), ``min_bound``). ``max_val`` defines the `max` in the transformation formula. When ``max_val`` is provided, `max` is always equal to ``max_val``. ``min_val`` defines the `min` in the transformation formula. When ``min_val`` is provided, `min` is always equal to ``min_val``. ``max_val`` and ``min_val`` are mainly used in the inference stage, where we want to use the max & min values computed in the training stage to normalize inference data. * **Numerical Rank Gauss transformation** normalizes numerical input features with rank gauss normalization. It maps the numeric feature values to a Gaussian distribution based on ranking. The method follows https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629#250927. The ``name`` field in the feature transformation dictionary is ``rank_gauss``. The dict can contain one optional field, i.e., ``epsilon``, which is used to avoid INF float during computation. * **Convert to categorical values** converts text data to categorical values. The `name` field is `to_categorical`. `separator` specifies how to split the string into multiple categorical values (this is only used to define multiple categorical values). If `separator` is not specified, the entire string is a categorical value. `mapping` is a dict that specifies how to map a string to an integer value that defines a categorical value. diff --git a/docs/source/index.rst b/docs/source/index.rst index 1ebee6708d..f745ad9913 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -33,18 +33,17 @@ Welcome to the GraphStorm Documentation and Tutorials advanced/advanced-usages .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: API Reference :hidden: :glob: api/graphstorm api/graphstorm.dataloading + api/graphstorm.eval + api/graphstorm.inference api/graphstorm.model api/graphstorm.trainer - api/graphstorm.inferrer - api/graphstorm.evaluator - api/graphstorm.customized GraphStorm is a graph machine learning (GML) framework designed for enterprise use cases. It simplifies the development, training and deployment of GML models on industry-scale graphs (measured in billions of nodes and edges) by providing scalable training and inference pipelines of GML models. GraphStorm comes with a collection of built-in GML models, allowing users to train a GML model with a single command, eliminating the need to write any code. Moreover, GraphStorm provides a wide range of configurations to customize model implementations and training pipelines, enhancing model performance.
In addition, GraphStorm offers a programming interface that enables users to train custom GML models in a distributed manner. Users can bring their own model implementations and leverage the GraphStorm training pipeline for scalability. diff --git a/examples/mag/README.md b/examples/mag/README.md index ab45834787..396cccf718 100644 --- a/examples/mag/README.md +++ b/examples/mag/README.md @@ -110,7 +110,7 @@ python3 -m graphstorm.run.gs_node_classification \ The accuracy is 41.88%. -### Fine-tune BERT model on the graph data and train GNN model to predict the venue +### Fine-tune BERT model on the graph data and train GNN model to predict the venue To achieve good performance, we should fine-tune the BERT model on the graph data. One way of fine-tuning the BERT model on the graph data is to fine-tune the BERT model @@ -188,3 +188,42 @@ python3 -m graphstorm.run.gs_node_classification \ The accuracy of RGCN with the BERT model fine-tuned with venue prediction is 63.22%, while the accuracy of HGT is 67.20%. + +### Co-training BERT and GNN models using GLEM to predict the venue + +[GLEM](https://arxiv.org/abs/2210.14709) is a variational EM framework that trains an LM and a GNN iteratively for semi-supervised node classification. There are two important prerequisites for achieving good performance with GLEM: + +1. The pseudolabeling technique: it predicts pseudolabels on the unlabeled nodes and uses them as an additional supervision signal for mutual distillation between the LM and GNN. This can be enabled by the `--use-pseudolabel true` argument on the command line. +2. Well pre-trained LM and GNN before the co-training: empirically, LM or GNN models that are not well-trained lead to degraded performance when co-training with GLEM directly. Therefore, we suggest users pre-train the LM and GNN first. This can be achieved by: + 1. Setting `num_pretrain_epochs` in the [yaml config](mag_glem_w_pretrain.yml). + + ``` + python3 -m graphstorm.run.gs_node_classification \ + --num-trainers 8 \ + --num-servers 4 \ + --num-samplers 0 \ + --part-config mag_min_4parts/mag.json \ + --ip-config ip_list_4p.txt \ + --cf mag_glem_w_pretrain.yml \ + --use-pseudolabel true + ``` + + 2. Restoring pretrained models from checkpoints using `--restore-model-path`. In the following example, we restore the GNN trained on the fine-tuned BERT model in the [previous section](#bert-ft-gnn). GLEM requires checkpoints of LM and GNN to be in the same path, under separate directories `LM` and `GNN`. It then loads the LM's `node_input_encoder` and GNN's `gnn_encoder` and `decoder`. Since our GNN checkpoint contains both the fine-tuned LM and GNN, we set up softlinks to point both LM and GNN to this checkpoint.
+ + ``` + # prepare paths to pretrained models: + mkdir mag_pretrained_models + ln -s mag_gnn_nc_model/epoch-7 mag_pretrained_models/LM + ln -s mag_gnn_nc_model/epoch-7 mag_pretrained_models/GNN + + # co-training pre-trained LM and GNN with GLEM: + python3 -m graphstorm.run.gs_node_classification \ + --num-trainers 8 \ + --num-servers 4 \ + --num-samplers 0 \ + --part-config mag_min_4parts/mag.json \ + --ip-config ip_list_4p.txt \ + --cf mag_glem_nc.yaml \ + --use-pseudolabel true \ + --restore-model-path mag_pretrained_models + ``` \ No newline at end of file diff --git a/examples/mag/mag_glem_nc.yaml b/examples/mag/mag_glem_nc.yaml new file mode 100644 index 0000000000..938ca1cbcb --- /dev/null +++ b/examples/mag/mag_glem_nc.yaml @@ -0,0 +1,62 @@ +--- +version: 1.0 +lm_model: + node_lm_models: + - + lm_type: bert + model_name: "bert-base-uncased" + gradient_checkpoint: false + node_types: + - paper + - + lm_type: bert + model_name: "bert-base-uncased" + gradient_checkpoint: false + node_types: + - fos +gsf: + basic: + backend: gloo + verbose: false + save_perf_results_path: null + lmgnn: + lm_train_nodes: 128 + lm_infer_batch_size: 128 + freeze_lm_encoder_epochs: 0 + model_encoder_type: rgcn + fanout: "5,5" + num_layers: 2 + hidden_size: 128 + use_mini_batch_infer: false + training_method: + name: glem + kwargs: + em_order_gnn_first: false + inference_using_gnn: true + pl_weight: 0.5 + num_pretrain_epochs: 0 + input: + restore_model_path: null + output: + save_model_path: null + save_embed_path: null + hyperparam: + dropout: 0. + lr: 0.00003 + lm_tune_lr: 0.00003 + sparse_optimizer_lr: 0.01 + num_epochs: 3 + batch_size: 128 + eval_batch_size: 128 + wd_l2norm: 0 + no_validation: false + rgcn: + num_bases: -1 + use_self_loop: false + lp_decoder_type: dot_product + use_node_embeddings: false + node_classification: + target_ntype: "paper" + label_field: "venue" + multilabel: false + num_classes: 1523 \ No newline at end of file diff --git a/examples/mag/mag_glem_w_pretrain.yml b/examples/mag/mag_glem_w_pretrain.yml new file mode 100644 index 0000000000..094bbb931b --- /dev/null +++ b/examples/mag/mag_glem_w_pretrain.yml @@ -0,0 +1,62 @@ +--- +version: 1.0 +lm_model: + node_lm_models: + - + lm_type: bert + model_name: "bert-base-uncased" + gradient_checkpoint: false + node_types: + - paper + - + lm_type: bert + model_name: "bert-base-uncased" + gradient_checkpoint: false + node_types: + - fos +gsf: + basic: + backend: gloo + verbose: false + save_perf_results_path: null + lmgnn: + lm_train_nodes: 64 + lm_infer_batch_size: 64 + freeze_lm_encoder_epochs: 0 + model_encoder_type: rgcn + fanout: "5,5" + eval_fanout: "20,20" + num_layers: 2 + hidden_size: 128 + use_mini_batch_infer: false + training_method: + name: glem + kwargs: + em_order_gnn_first: false + inference_using_gnn: true + pl_weight: 0.5 + num_pretrain_epochs: 10 + input: + restore_model_path: null + output: + save_model_path: null + save_embed_path: null + hyperparam: + dropout: 0. 
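+    # Added comment for clarity (per GraphStorm's configuration semantics): lr applies to the dense model parameters, lm_tune_lr to LM fine-tuning, and sparse_optimizer_lr to learnable sparse embeddings.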
+ lr: 0.00003 + lm_tune_lr: 0.00003 + sparse_optimizer_lr: 0.01 + num_epochs: 3 + batch_size: 64 + eval_batch_size: 64 + wd_l2norm: 0 + no_validation: false + rgcn: + num_bases: -1 + use_self_loop: true + use_node_embeddings: false + node_classification: + target_ntype: "paper" + label_field: "venue" + multilabel: false + num_classes: 1523 \ No newline at end of file diff --git a/graphstorm-processing/docs/Makefile b/graphstorm-processing/docs/Makefile new file mode 100644 index 0000000000..d0c3cbf102 --- /dev/null +++ b/graphstorm-processing/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/graphstorm-processing/docs/make.bat b/graphstorm-processing/docs/make.bat new file mode 100644 index 0000000000..6247f7e231 --- /dev/null +++ b/graphstorm-processing/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/graphstorm-processing/docs/source/conf.py b/graphstorm-processing/docs/source/conf.py new file mode 100644 index 0000000000..7334ba97ae --- /dev/null +++ b/graphstorm-processing/docs/source/conf.py @@ -0,0 +1,53 @@ +# pylint: skip-file +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'graphstorm-processing' +copyright = '2023, AGML Team' +author = 'AGML Team, Amazon' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
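+# Note (suggestion, not part of the original setup): no extensions are enabled by default; Sphinx extensions such as "sphinx.ext.autodoc" or "sphinx.ext.napoleon" could be added below if API documentation is needed later.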
+extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/graphstorm-processing/docs/source/developer/developer-guide.rst b/graphstorm-processing/docs/source/developer/developer-guide.rst new file mode 100644 index 0000000000..1a7faf85db --- /dev/null +++ b/graphstorm-processing/docs/source/developer/developer-guide.rst @@ -0,0 +1,230 @@ +Developer Guide +--------------- + +The project is set up using ``poetry`` to make it easier for developers to +jump into the project. + +The steps we recommend are: + +Install JDK 8, 11 +~~~~~~~~~~~~~~~~~ + +PySpark requires a compatible Java installation to run, so +you will need to ensure your active JDK is using either +Java 8 or 11. + +On MacOS you can do this using ``brew``: + +.. code-block:: bash + + brew install openjdk@11 + +On Linux it will depend on your distribution's package +manager. For Ubuntu you can use: + +.. code-block:: bash + + sudo apt install openjdk-11-jdk + +On Amazon Linux 2 you can use: + +.. code-block:: bash + + sudo yum install java-11-amazon-corretto-headless + sudo yum install java-11-amazon-corretto-devel + +Install ``pyenv`` +~~~~~~~~~~~~~~~~~ + +``pyenv`` is a tool to manage multiple Python version installations. It +can be installed through the installer below on a Linux machine: + +.. code-block:: bash + + curl -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer | bash + +or use ``brew`` on a Mac: + +.. code-block:: bash + + brew update + brew install pyenv + +For more info on ``pyenv`` see `its documentation <https://github.com/pyenv/pyenv>`_. + +Create a Python 3.9 env and activate it. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use Python 3.9 in our images so this most closely resembles the +execution environment on our Docker images that will be used for distributed +training. + +.. code-block:: bash + + pyenv install 3.9 + pyenv global 3.9 + +.. + + Note: We recommend not mixing up ``conda`` and ``pyenv``. When developing for + this project, simply ``conda deactivate`` until there's no ``conda`` + env active (even ``base``) and just rely on ``pyenv`` and ``poetry`` to handle + dependencies. + +Install ``poetry`` +~~~~~~~~~~~~~~~~~~ + +``poetry`` is a dependency and build management system for Python. To install it +use: + +.. code-block:: bash + + curl -sSL https://install.python-poetry.org | python3 - + +Install dependencies through ``poetry`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Now we are ready to install our dependencies through ``poetry``. + +We have split the project dependencies into the “main” dependencies that +``poetry`` installs by default, and the ``dev`` dependency group that +installs the dependencies that are only needed to develop the library. + +**On a POSIX system** (tested on Ubuntu, CentOS, MacOS) run: + +..
code-block:: bash + + # Install all dependencies into local .venv + poetry install --with dev + +Once all dependencies are installed you should be able to run the unit +tests for the project and continue with development using: + +.. code-block:: bash + + poetry run pytest ./graphstorm-processing/tests + +You can also activate and use the virtual environment using: + +.. code-block:: bash + + poetry shell + # We're now using the graphstorm-processing-py3.9 env so we can just run + pytest ./graphstorm-processing/tests + +To learn more about ``poetry`` see its `documentation <https://python-poetry.org/docs/>`_. + +Use ``black`` to format code [optional] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use `black <https://github.com/psf/black>`_ to +format code in this project. ``black`` is an opinionated formatter that +helps speed up development and code reviews. It is included in our +``dev`` dependencies so it will be installed along with the other dev +dependencies. + +To use ``black`` in the project you can run (from the project's root, +same level as ``pyproject.toml``): + +.. code-block:: bash + + # From the project's root directory (graphstorm-processing) run: + black . + +To get a preview of the changes ``black`` would make, you can use: + +.. code-block:: bash + + black . --diff --color + +You can add auto-formatting with ``black`` in VSCode using the `Black +Formatter <https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter>`__ extension. + + +Use mypy and pylint to lint code +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We include the ``mypy`` and ``pylint`` linters as dependencies under the ``dev`` group +of dependencies. These linters perform static checks on your code and +can be used in a complementary manner. + +We recommend using VSCode and enabling the mypy linter +to get in-editor annotations. + +You can also lint the project code through: + +.. code-block:: bash + + poetry run mypy ./graphstorm_processing + +To learn more about ``mypy`` and how it can help development +`see its documentation <https://mypy.readthedocs.io/>`_. + + +Our goal is to minimize ``mypy`` errors as much as possible for the +project. New code should be linted and not introduce additional mypy +errors. When necessary it's OK to use ``type: ignore`` to silence +``mypy`` errors inline, but this should be used sparingly. + +As a project, GraphStorm requires a 10/10 pylint score, so +ensure your code conforms to the expectation by running + +.. code-block:: bash + + pylint --rcfile=/path/to/graphstorm/tests/lint/pylintrc + +on your code before commits. To make this easier we include +a pre-commit hook below. + +Use a pre-commit hook to ensure ``black`` and ``pylint`` run before commits +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To make code formatting and ``pylint`` checks easier for graphstorm-processing +developers, we recommend using a pre-commit hook. + +We include ``pre-commit`` in the project's ``dev`` dependencies, so once +you have activated the project's venv (``poetry shell``) you can just +create a file named ``.pre-commit-config.yaml`` with the following contents: + +.. code-block:: yaml + + # .pre-commit-config.yaml + repos: + - repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black + language_version: python3.9 + files: 'graphstorm_processing\/.*\.pyi?$|tests\/.*\.pyi?$|scripts\/.*\.pyi?$' + exclude: 'python\/.*\.pyi' + - repo: local + hooks: + - id: pylint + name: pylint + entry: pylint + language: system + types: [python] + args: + [ + "--rcfile=./tests/lint/pylintrc" + ] + + +And then run: + +..
code-block:: bash + + pre-commit install + +which will install the ``black`` and ``pylint`` hooks into your local repository and +ensure they run before every commit. + +.. note:: + + The pre-commit hook will also apply to all commits you make to the root + GraphStorm repository. Since GraphStorm doesn't use ``black``, you might + want to remove the hooks. You can do so from the root repo + using ``rm -rf .git/hooks``. + + Both projects use ``pylint`` to check Python files so we'd still recommend using + that hook even if you're doing development for both GSProcessing and GraphStorm. diff --git a/graphstorm-processing/docs/source/developer/input-configuration.rst b/graphstorm-processing/docs/source/developer/input-configuration.rst new file mode 100644 index 0000000000..e6e2d7ae98 --- /dev/null +++ b/graphstorm-processing/docs/source/developer/input-configuration.rst @@ -0,0 +1,430 @@ +.. _input-configuration: + +GraphStorm Processing Input Configuration +========================================= + +GraphStorm Processing uses a JSON configuration file to +parse and process the data into the format needed +by GraphStorm partitioning and training downstream. + +We use this configuration format as an intermediate +between other config formats, such as the one used +by the single-machine GConstruct module. + +GSProcessing can take a GConstruct-formatted file +directly, and we also provide a script +that can convert a GConstruct +input configuration file into the ``GSProcessing`` format, +although this is mostly aimed at developers; users +can rely on the automatic conversion. + +The GSProcessing input data configuration has two top-level objects: + +.. code-block:: json + + { + "version": "gsprocessing-v1.0", + "graph": {} + } + +- ``version`` (String, required): The version of the configuration file being used. We include + the package name to allow self-contained identification of the file format. +- ``graph`` (JSON object, required): one configuration object that defines each + of the node types and edge types that describe the graph. + +We describe the ``graph`` object next. + +``graph`` configuration object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``graph`` configuration object can have two top-level objects: + +.. code-block:: json + + { + "edges": [{}], + "nodes": [{}] + } + +- ``edges``: (array of JSON objects, required). Each JSON object + in this array describes one edge type and determines how the edge + structure will be parsed. +- ``nodes``: (array of JSON objects, optional). Each JSON object + in this array describes one node type. This key is optional; if + it is missing, node IDs are derived from the ``edges`` objects. + +-------------- + +Contents of an ``edges`` configuration object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An ``edges`` configuration object can contain the following top-level +objects: + +.. code-block:: json + + { + "data": { + "format": "String", + "files": ["String"], + "separator": "String" + }, + "source": {"column": "String", "type": "String"}, + "relation": {"type": "String"}, + "destination": {"column": "String", "type": "String"}, + "labels" : [ + { + "column": "String", + "type": "String", + "split_rate": { + "train": "Float", + "val": "Float", + "test": "Float" + } + } + ], + "features": [{}] + } + +- ``data`` (JSON Object, required): Describes the physical files + that store the data described in this object.
The JSON object has the following + top-level keys: + + - ``format`` (String, required): indicates the format the data is + stored in. We accept either ``"csv"`` or ``"parquet"`` as valid + file formats. + + - ``files`` (array of String, required): the physical location of + files. The format accepts two options: + + - a single-element list with a directory-like (ending in ``/``) + **relative** path under which all the files that correspond to + the current edge type are stored. + + - e.g. ``"files": ['path/to/edge/type/']`` + - This option allows for concise listing of entire types and + is preferred. All the files under the path will be loaded. + + - a multi-element list of **relative** file paths. + + - ``"files": ['path/to/edge/type/file_1.csv', 'path/to/edge/type/file_2.csv']`` + - This option allows for multiple types to be stored under the + same input prefix, but will result in more verbose spec + files. + + - Since the spec expects **relative paths**, the caller is + responsible for providing a path prefix to the execution + engine. The prefix will determine if the source is a local + filesystem or S3, allowing the spec to be portable, i.e. a user + can move the physical files and the spec will still be valid, + as long as the relative structure is kept. + + - ``separator`` (String, optional): Only relevant for CSV files, + determines the separator used between each column in the files. + +- ``source``: (JSON object, required): Describes the source nodes + for the edge type. The top-level keys for the object are: + + - ``column``: (String, required) The name of the column in the + physical data files. + - ``type``: (String, optional) The type name of the nodes. If not + provided, we assume that the column name is the type name. + +- ``destination``: (JSON object, required): Describes the + destination nodes for the edge type. Its format is the same as the + ``source`` key, with a JSON object that contains + ``{"column": String, "type": String}``. +- ``relation``: (JSON object, required): Describes the relation + modeled by the edges. A relation can be common among all edges, or it + can have sub-types. The top-level keys for the object are: + + - ``type`` (String, required): The type of the relation described by + the edges. For example, for a source type ``user``, destination + ``movie`` we can have a relation type ``interacted_with`` for an + edge type ``user:interacted_with:movie``. + +- ``labels`` (List of JSON objects, optional): Describes the label + for the current edge type. The label object has the following + top-level keys: + + - ``column`` (String, required): The column that contains the values + for the label. Should be the empty string, ``""`` if the ``type`` + key has the value ``"link_prediction"``. + - ``type`` (String, required): The type of the learning task. Can + take the following String values: + + - ``"classification"``: An edge classification task. The values + in the specified ``column`` are treated as categorical + variables. + - ``"regression"``: An edge regression task. The values in the + specified ``column`` are treated as numerical values. + - ``"link_prediction"``: A link prediction task. The ``column`` + should be ``""`` in this case. + + - ``separator``: (String, optional): For multi-label classification + tasks, this separator is used within the column to list multiple + classification labels in one entry. + - ``split_rate`` (JSON object, optional): Defines a split rate + for the label items.
The sum of the values for ``train``, ``val`` and + ``test`` needs to be 1.0. + + - ``train``: The percentage of the data with available labels to + assign to the train set (0.0, 1.0]. + - ``val``: The percentage of the data with available labels to + assign to the validation set [0.0, 1.0). + - ``test``: The percentage of the data with available labels to + assign to the test set [0.0, 1.0). + +- ``features`` (List of JSON objects, optional)\ **:** Describes + the set of features for the current edge type. See the :ref:`features-object` section for details. + +-------------- + +Contents of a ``nodes`` configuration object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A node configuration object in a ``nodes`` field can contain the +following top-level keys: + +.. code-block:: json + + { + "data": { + "format": "String", + "files": ["String"], + "separator": "String" + }, + "column" : "String", + "type" : "String", + "labels" : [ + { + "column": "String", + "type": "String", + "separator": "String", + "split_rate": { + "train": "Float", + "val": "Float", + "test": "Float" + } + } + ], + "features": [{}] + } + +- ``data``: (JSON object, required): Has the same definition as for + the edges object, with one top-level key for the ``format`` that + takes a String value, and one for the ``files`` that takes an array + of String values. +- ``column``: (String, required): The name of the column in the data + that stores the node IDs. +- ``type``: (String, optional): A type name for the nodes described + in this object. If not provided, the ``column`` value is used as the + node type. +- ``labels``: (List of JSON objects, optional): Similar to the + labels object defined for edges, but the values that the ``type`` can + take are different. + + - ``column`` (String, required): The name of the column that + contains the label values. + - ``type`` (String, required): Specifies the target task type, which + can be: + + - ``"classification"``: A node classification task. The values in the specified + ``column`` are treated as categorical variables. + - ``"regression"``: A node regression task. The values in the specified + ``column`` are treated as float values. + + - ``separator`` (String, optional): For multi-label + classification tasks, this separator is used within the column to + list multiple classification labels in one entry. + + - e.g. with separator ``|`` we can have ``action|comedy`` as a + label value. + + - ``split_rate`` (JSON object, optional): Defines a split rate + for the label items. The sum of the values for ``train``, ``val`` and + ``test`` needs to be 1.0. + + - ``train``: The percentage of the data with available labels to + assign to the train set (0.0, 1.0]. + - ``val``: The percentage of the data with available labels to + assign to the validation set [0.0, 1.0). + - ``test``: The percentage of the data with available labels to + assign to the test set [0.0, 1.0). + +- ``features`` (List of JSON objects, optional): Describes + the set of features for the current node type. See the next section, :ref:`features-object` + for details. + +-------------- + +.. _features-object: + +Contents of a ``features`` configuration object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An element of a ``features`` configuration object (for edges or nodes) +can contain the following top-level keys: + +..
code-block:: json + + { + "column": "String", + "name": "String", + "transformation": { + "name": "String", + "kwargs": { + "arg_name": "" + } + }, + "data": { + "format": "String", + "files": ["String"], + "separator": "String" + } + } + +- ``column`` (String, required): The column that contains the raw + feature values in the dataset. +- ``transformation`` (JSON object, optional): The type of + transformation that will be applied to the feature. For details on + the individual transformations supported see :ref:`supported-transformations`. + If this key is missing, the feature is treated as + a **no-op** feature without ``kwargs``. + + - ``name`` (String, required): The name of the transformation to be + applied. + - ``kwargs`` (JSON object, optional): A dictionary of parameter + names and values. Each individual transformation will have its own + supported parameters, described in :ref:`supported-transformations`. + +- ``name`` (String, optional): The name that will be given to the + encoded feature. If not given, **column** is used as the output name. +- ``data`` (JSON object, optional): If the data for the feature + exist in a file source that's different from the rest of the data of + the node/edge type, they are provided here. For example, you could + keep each feature in its own file source: + + .. code-block:: python + + # Example node config with multiple features + { + # This is where the node structure data exist; we just need an id col + "data": { + "format": "parquet", + "files": ["path/to/node_ids"] + }, + "column" : "node_id", + "type" : "my_node_type", + "features": [ + # Feature 1 + { + "column": "feature_one", + # The files contain one "node_id" col and one "feature_one" col + "data": { + "format": "parquet", + "files": ["path/to/feature_one/"] + } + }, + # Feature 2 + { + "column": "feature_two", + # The files contain one "node_id" col and one "feature_two" col + "data": { + "format": "parquet", + "files": ["path/to/feature_two/"] + } + } + ] + } + + + **The file source needs + to contain the column names of the parent node/edge type to allow a + 1-1 mapping between the structure and feature files.** + + For nodes, the feature files need to have one column named with the node id column + name (the value of ``"column"`` for the parent node type); + for edges we need both the ``source`` and + ``destination`` columns to use as a composite key. + +.. _supported-transformations: + +Supported transformations +~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this section we'll describe the transformations we support. +The name of the transformation is the value that would appear +in the ``transformation['name']`` element of the feature configuration, +with the attached ``kwargs`` for the transformations that support +arguments. + +- ``no-op`` + + - Passes along the data as-is to be written to storage and + used in the partitioning pipeline. The data are assumed to be single + values or vectors of floats. + - ``kwargs``: + + - ``separator`` (String, optional): Only relevant for CSV file + sources, when a separator is used to encode vector feature + values into one column. If given, the separator will be used to + split the values in the column and create a vector column + output. Example: for a separator ``'|'`` the CSV value + ``1|2|3`` would be transformed to a vector, ``[1, 2, 3]``. + +-------------- + +Examples +~~~~~~~~ + +OAG-Paper dataset +----------------- + +..
code-block:: json + + { + "version" : "gsprocessing-v1.0", + "graph" : { + "edges" : [ + { + "data": { + "format": "csv", + "files": [ + "edges.csv" + ], + "separator": "," + }, + "source": {"column": "~from", "type": "paper"}, + "destination": {"column": "~to", "type": "paper"}, + "relation": {"type": "cites"} + } + ], + "nodes" : [ + { + "data": { + "format": "csv", + "separator": ",", + "files": [ + "node_feat.csv" + ] + }, + "type": "paper", + "column": "ID", + "labels": [ + { + "column": "field", + "type": "classification", + "separator": ";", + "split_rate": { + "train": 0.7, + "val": 0.1, + "test": 0.2 + } + } + ] + } + ] + } + } diff --git a/graphstorm-processing/docs/source/index.rst b/graphstorm-processing/docs/source/index.rst new file mode 100644 index 0000000000..cc027cbb08 --- /dev/null +++ b/graphstorm-processing/docs/source/index.rst @@ -0,0 +1,154 @@ +.. graphstorm-processing documentation master file, created by + sphinx-quickstart on Tue Aug 1 02:04:45 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to GraphStorm Distributed Data Processing documentation! +================================================================= + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + Example <usage/example> + Distributed processing setup <usage/distributed-processing-setup> + Running on Amazon SageMaker <usage/amazon-sagemaker> + Developer Guide <developer/developer-guide> + Input configuration <developer/input-configuration> + + +GraphStorm Distributed Data Processing allows you to process and prepare massive graph data +for training with GraphStorm. GraphStorm Processing takes care of generating +unique ids for nodes, using them to encode edge structure files, process +individual features and prepare the data to be passed into the +distributed partitioning and training pipeline of GraphStorm. + +We use PySpark to achieve +horizontal parallelism, allowing us to scale to graphs with billions of nodes +and edges. + +.. _installation-ref: + +Installation +------------ + +The project uses Python 3.9. We recommend using `PyEnv <https://github.com/pyenv/pyenv>`_ +to have isolated Python installations. + +With PyEnv installed you can create and activate a Python 3.9 environment using: + +.. code-block:: bash + + pyenv install 3.9 + pyenv local 3.9 + + +With a recent version of ``pip`` installed (we recommend ``pip>=21.3``), you can simply run ``pip install .`` +from the root directory of the project (``graphstorm/graphstorm-processing``), +which should install the library into your environment and pull in all dependencies. + +Install Java 8, 11, or 17 +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Spark has a runtime dependency on the JVM, so you'll need to ensure +Java is installed and available on your system. + +On MacOS you can install Java using ``brew``: + +.. code-block:: bash + + brew install openjdk@11 + +On Linux it will depend on your distribution's package +manager. For Ubuntu you can use: + +.. code-block:: bash + + sudo apt install openjdk-11-jdk + +On Amazon Linux 2 you can use: + +.. code-block:: bash + + sudo yum install java-11-amazon-corretto-headless + sudo yum install java-11-amazon-corretto-devel + +To check if Java is installed you can use: + +.. code-block:: bash + + java -version + + +Example +------- + +See the provided :doc:`usage/example` for an example of how to start with tabular +data and convert them into a graph representation before partitioning and +training with GraphStorm. + +Usage +----- + +To use the library to process your data, you will need to have your data +in a tabular format, and a corresponding JSON configuration file that describes the +data.
The input data can be in CSV (with header(s)) or Parquet format. + +The configuration file can be in GraphStorm's GConstruct format, +with the caveat that the file paths need to be relative to the +location of the config file. See :doc:`/usage/example` for more details. + +After installing the library, executing a processing job locally can be done using: + +.. code-block:: bash + + gs-processing \ + --config-filename gconstruct-config.json \ + --input-prefix /path/to/input/data \ + --output-prefix /path/to/output/data + +Once the processing engine has processed the data, we want to ensure +they match the requirements of the DGL distributed partitioning +pipeline, so we need to run an additional script that will +make sure the produced data matches the assumptions of DGL [#f1]_. + +.. note:: + + Ensure you pass the output path of the previous step as the input path here. + +.. code-block:: bash + + gs-repartition \ + --input-prefix /path/to/output/data + +Once this script completes, the data are ready to be fed into DGL's distributed +partitioning pipeline. +See the GraphStorm SageMaker guide +for more details on how to use GraphStorm distributed partitioning on SageMaker. + +See :doc:`/usage/example` for a detailed walkthrough of using GSProcessing to +wrangle data into a format that's ready to be consumed by the GraphStorm/DGL +partitioning pipeline. + + +Using with Amazon SageMaker +--------------------------- + +To run distributed jobs on Amazon SageMaker we will have to build a Docker image +and push it to the Amazon Elastic Container Registry, which we cover in +:doc:`usage/distributed-processing-setup` and run a SageMaker Processing +job, which we describe in :doc:`/usage/amazon-sagemaker`. + + +Developer guide +--------------- + +To get started with developing the package refer to :doc:`/developer/developer-guide`. + + +.. rubric:: Footnotes + +.. [#f1] DGL expects that every file produced for a single node/edge type + has matching row counts, which is something that Spark cannot guarantee. + We use the re-partitioning script to fix this where needed in the produced + output. \ No newline at end of file diff --git a/graphstorm-processing/docs/source/usage/amazon-sagemaker.rst b/graphstorm-processing/docs/source/usage/amazon-sagemaker.rst new file mode 100644 index 0000000000..53fe61c922 --- /dev/null +++ b/graphstorm-processing/docs/source/usage/amazon-sagemaker.rst @@ -0,0 +1,154 @@ +Running distributed jobs on Amazon SageMaker +============================================ + +Once the :doc:`distributed processing setup </usage/distributed-processing-setup>` is complete, we can +use the Amazon SageMaker launch scripts to launch distributed processing +jobs that use AWS resources. + +To demonstrate the usage of GSProcessing on Amazon SageMaker, we will execute the same job we used in our local +execution example, but this time use Amazon SageMaker to provide the compute resources instead of our +local machine. + +Upload data to S3 +----------------- + +Amazon SageMaker uses S3 as its storage target, so before starting +we'll need to upload our test data to S3. To do so you will need +to have read/write access to an S3 bucket, and the requisite AWS credentials +and permissions. + +We will use the AWS CLI to upload data so make sure it is +`installed <https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html>`_ +and `configured <https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html>`_ +in your local environment. + +Assuming ``graphstorm/graphstorm-processing`` is our current working +directory we can upload the test data to S3 using: + +.. code-block:: bash + + MY_BUCKET="enter-your-bucket-name-here" + REGION="bucket-region" # e.g.
+   aws --region ${REGION} s3 sync ./tests/resources/small_heterogeneous_graph/ \
+       "s3://${MY_BUCKET}/gsprocessing-input"
+
+.. note::
+
+   Make sure you are uploading your data to a bucket
+   that was created in the same region as the ECR image
+   you pushed in :doc:`/usage/distributed-processing-setup`.
+
+
+Launch the GSProcessing job on Amazon SageMaker
+-----------------------------------------------
+
+Once the data are uploaded to S3, we can use the Python script
+``graphstorm-processing/scripts/run_distributed_processing.py``
+to run a GSProcessing job on Amazon SageMaker.
+
+For this example we'll use a SageMaker Spark cluster with 2 ``ml.t3.xlarge`` instances
+since this is a tiny dataset. Using SageMaker you'll be able to create clusters
+of up to 20 instances, allowing you to scale your processing to massive graphs,
+using larger instances like ``ml.r5.24xlarge``.
+
+Since we're now executing on AWS, we'll need access to an execution role
+for SageMaker and the ECR image URI we created in :doc:`/usage/distributed-processing-setup`.
+For instructions on how to create an execution role for SageMaker
+see the `AWS SageMaker documentation `_.
+
+Let's set up a small bash script that will run the parametrized processing
+job, followed by the re-partitioning job, both on SageMaker:
+
+.. code-block:: bash
+
+   ACCOUNT="enter-your-account-id-here" # e.g. 1234567890
+   MY_BUCKET="enter-your-bucket-name-here"
+   SAGEMAKER_ROLE_NAME="enter-your-sagemaker-execution-role-name-here"
+   REGION="bucket-region" # e.g. us-west-2
+   DATASET_S3_PATH="s3://${MY_BUCKET}/gsprocessing-input"
+   OUTPUT_BUCKET=${MY_BUCKET}
+   DATASET_NAME="small-graph"
+   CONFIG_FILE="gconstruct-config.json"
+   INSTANCE_COUNT="2"
+   INSTANCE_TYPE="ml.t3.xlarge"
+   NUM_FILES="4"
+
+   IMAGE_URI="${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/graphstorm-processing:0.1.0"
+   ROLE="arn:aws:iam::${ACCOUNT}:role/service-role/${SAGEMAKER_ROLE_NAME}"
+
+   OUTPUT_PREFIX="s3://${OUTPUT_BUCKET}/gsprocessing/${DATASET_NAME}/${INSTANCE_COUNT}x-${INSTANCE_TYPE}-${NUM_FILES}files/"
+
+   # Conditionally delete data at output
+   echo "Delete all data under output path? ${OUTPUT_PREFIX}"
+   select yn in "Yes" "No"; do
+       case $yn in
+           Yes ) aws s3 rm --recursive ${OUTPUT_PREFIX} --quiet; break;;
+           No ) break;;
+       esac
+   done
+
+   # This will run and block until the GSProcessing job is done
+   python scripts/run_distributed_processing.py \
+       --s3-input-prefix ${DATASET_S3_PATH} \
+       --s3-output-prefix ${OUTPUT_PREFIX} \
+       --role ${ROLE} \
+       --image ${IMAGE_URI} \
+       --region ${REGION} \
+       --config-filename ${CONFIG_FILE} \
+       --instance-count ${INSTANCE_COUNT} \
+       --instance-type ${INSTANCE_TYPE} \
+       --job-name "${DATASET_NAME}-${INSTANCE_COUNT}x-${INSTANCE_TYPE//./-}-${NUM_FILES}files" \
+       --num-output-files ${NUM_FILES} \
+       --wait-for-job
+
+   # This will run the follow-up re-partitioning job
+   python scripts/run_repartitioning.py --s3-input-prefix ${OUTPUT_PREFIX} \
+       --role ${ROLE} --image ${IMAGE_URI} --config-filename "metadata.json" \
+       --instance-type ${INSTANCE_TYPE} --wait-for-job
+
+
+.. note::
+
+   The re-partitioning job runs on a single instance, so for large graphs you will
+   want to scale up to an instance with more memory to avoid memory errors. ``ml.r5`` instances
+   should allow you to re-partition graph data with billions of nodes and edges.
+
+The ``--num-output-files`` parameter
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can see that we provided a parameter named
+``--num-output-files`` to ``run_distributed_processing.py``.
+This is an important parameter, as it provides a hint to set the parallelism for Spark.
+
+You can safely skip it and let Spark decide the proper value based on the cluster's
+instance type and count. If you do set it yourself, a good value to use is
+``num_instances * num_cores_per_instance * 2``, which will ensure good
+utilization of the cluster resources.
+
+
+Examine the output
+------------------
+
+Once both jobs are finished we can examine the output created, which
+should match the output we saw when running the same jobs locally
+in :doc:`/usage/example`:
+
+
+.. code-block:: bash
+
+   $ aws s3 ls ${OUTPUT_PREFIX}
+
+                              PRE edges/
+                              PRE node_data/
+                              PRE node_id_mappings/
+   2023-08-05 00:47:36        804 launch_arguments.json
+   2023-08-05 00:47:36      11914 metadata.json
+   2023-08-05 00:47:37        545 perf_counters.json
+   2023-08-05 00:47:37      12082 updated_row_counts_metadata.json
+
+Run distributed partitioning and training on Amazon SageMaker
+-------------------------------------------------------------
+
+With the data now processed you can follow the
+`GraphStorm Amazon SageMaker guide `_
+to partition your data and run training on AWS.
diff --git a/graphstorm-processing/docs/source/usage/distributed-processing-setup.rst b/graphstorm-processing/docs/source/usage/distributed-processing-setup.rst new file mode 100644 index 0000000000..785dd5a514 --- /dev/null +++ b/graphstorm-processing/docs/source/usage/distributed-processing-setup.rst @@ -0,0 +1,136 @@
+Distributed Processing setup for Amazon SageMaker
+=================================================
+
+In this guide we'll demonstrate how to prepare your environment to run
+GraphStorm Processing (GSP) jobs on Amazon SageMaker.
+
+We assume a Linux host environment throughout
+this tutorial, but other operating systems should work as well.
+
+The steps required are:
+
+- Clone the GraphStorm repository.
+- Install Docker.
+- Install Poetry.
+- Set up AWS access.
+- Build the GraphStorm Processing image using Docker.
+- Push the image to the Amazon Elastic Container Registry (ECR).
+- Launch a SageMaker Processing job using the example scripts.
+
+Clone the GraphStorm repository
+-------------------------------
+
+You can clone the GraphStorm repository using:
+
+.. code-block:: bash
+
+   git clone https://github.com/awslabs/graphstorm.git
+
+You can then navigate to the ``graphstorm-processing/docker`` directory
+that contains the relevant code:
+
+.. code-block:: bash
+
+   cd ./graphstorm/graphstorm-processing/docker
+
+Install Docker
+--------------
+
+To get started with building the GraphStorm Processing image
+you'll need to have the Docker engine installed.
+
+
+To install Docker follow the instructions at the
+`official site `_.
+
+Install Poetry
+--------------
+
+We use `Poetry `_ as our build
+tool and for dependency management,
+so we need to install it to facilitate building the library.
+
+You can install Poetry using:
+
+.. code-block:: bash
+
+   curl -sSL https://install.python-poetry.org | python3 -
+
+For detailed installation instructions see the
+`Poetry docs `_.
+
+
+Set up AWS access
+-----------------
+
+To build and push the image to ECR we'll make use of the
+``aws-cli`` and we'll need valid AWS credentials as well.
+
+To install the AWS CLI you can use:
+
+.. code-block:: bash
+
+   curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+   unzip awscliv2.zip
+   sudo ./aws/install
+
+To set up credentials for use with ``aws-cli`` see the
+`AWS docs `_.
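+
+Before building and pushing the image, it can help to confirm which AWS
+account and identity your credentials resolve to. The following is a minimal
+illustrative sketch, not part of GSProcessing itself; it assumes the
+``boto3`` package is installed:
+
+.. code-block:: python
+
+   import boto3
+
+   # Ask STS which account and identity the configured credentials belong to.
+   # The account ID printed here should match the account that will host the
+   # ECR repository and the SageMaker execution role.
+   identity = boto3.client("sts").get_caller_identity()
+   print(f"Account: {identity['Account']}")
+   print(f"Identity ARN: {identity['Arn']}")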
+
+Your role should have full ECR access to be able to pull from ECR to build the image,
+create an ECR repository if it doesn't exist, and push the GSProcessing image to the repository.
+
+Building the GraphStorm Processing image using Docker
+-----------------------------------------------------
+
+Once Docker and Poetry are installed, and your AWS credentials are set up,
+we can use the provided scripts
+in the ``graphstorm-processing/docker`` directory to build the image.
+
+The ``build_gsprocessing_image.sh`` script can build the image
+locally and tag it. For example, assuming our current directory is where
+we cloned ``graphstorm/graphstorm-processing``:
+
+.. code-block:: bash
+
+   bash docker/build_gsprocessing_image.sh
+
+The above will use the Dockerfile of the latest available GSProcessing version,
+build an image and tag it as ``graphstorm-processing:${VERSION}`` where
+``${VERSION}`` will be the latest available GSProcessing version (e.g. ``0.1.0``).
+
+The script also supports other arguments to customize the image name,
+tag and other aspects of the build. See ``bash docker/build_gsprocessing_image.sh --help``
+for more information.
+
+Push the image to the Amazon Elastic Container Registry (ECR)
+-------------------------------------------------------------
+
+Once the image is built we can use the ``push_gsprocessing_image.sh`` script
+that will create an ECR repository if needed and push the image we just built.
+
+The script does not require any arguments and by default will
+create a repository named ``graphstorm-processing`` in the ``us-west-2`` region,
+on the default AWS account ``aws-cli`` is configured for,
+and push the image tagged with the latest version of GSProcessing.
+
+The script supports 4 optional arguments:
+
+1. Image name/repository. (``-i/--image``) Default: ``graphstorm-processing``.
+2. Image tag. (``-v/--version``) Default: the latest available GSProcessing version, e.g. ``0.1.0``.
+3. ECR region. (``-r/--region``) Default: ``us-west-2``.
+4. AWS Account ID. (``-a/--account``) Default: uses the account ID detected by the ``aws-cli``.
+
+Example:
+
+.. code-block:: bash
+
+   bash push_gsprocessing_image.sh -i "graphstorm-processing" -v "0.1.0" -r "us-west-2" -a "1234567890"
+
+
+Launch a SageMaker Processing job using the example scripts
+------------------------------------------------------------
+
+Once the setup is complete, you can follow the
+:doc:`SageMaker Processing job guide `
+to launch your distributed processing job using AWS resources.
diff --git a/graphstorm-processing/docs/source/usage/example.rst b/graphstorm-processing/docs/source/usage/example.rst new file mode 100644 index 0000000000..ab25b5a1f1 --- /dev/null +++ b/graphstorm-processing/docs/source/usage/example.rst @@ -0,0 +1,268 @@
+GraphStorm Processing example
+=============================
+
+To demonstrate how to use the library locally we will
+use the same example data as we use in our
+unit tests, which you can find in the project's repository,
+under ``graphstorm/graphstorm-processing/tests/resources/small_heterogeneous_graph``.
+
+Install example dependencies
+----------------------------
+
+To run the local example you will need to install the GSProcessing
+library to your Python environment, and you'll need to clone the
+GraphStorm repository to get access to the data.
+
+Follow the :ref:`installation-ref` guide to install the GSProcessing library.
+
+You can clone the repository using:
+
+.. code-block:: bash
+
+   git clone https://github.com/awslabs/graphstorm.git
+
+You can then navigate to the ``graphstorm-processing/`` directory
+that contains the relevant data:
+
+.. code-block:: bash
+
+   cd ./graphstorm/graphstorm-processing/
+
+
+Expected file inputs and configuration
+--------------------------------------
+
+GSProcessing expects the input files to be in a specific format that will allow
+us to perform the processing and prepare the data for partitioning and training.
+
+The data files are expected to be:
+
+* Tabular data files. We support CSV-with-header format, or Parquet format.
+  The files can be split (multiple parts), or a single file.
+* Available on a local file system or on S3.
+* One tabular file source per edge and node type. For example, for a particular edge
+  type, all node identifiers (source, destination), features, and labels should
+  exist as columns in a single file source.
+
+Apart from the data, GSProcessing also requires a configuration file that describes the
+data and the transformations we will need to apply to the features and any encoding needed for
+labels.
+We support both the `GConstruct configuration format `_,
+and the library's own GSProcessing format, described in :doc:`/developer/input-configuration`.
+
+.. note::
+   We expect end users to only provide a GConstruct configuration file,
+   and only use the configuration format of GSProcessing as an intermediate
+   layer to decouple the two projects.
+
+   Developers who are looking to use GSProcessing
+   as their backend processing engine can either use the GSProcessing configuration
+   format directly, or translate their own configuration format to GSProcessing,
+   as we do with GConstruct.
+
+For a detailed description of all the entries of the GSProcessing configuration file see
+:doc:`/developer/input-configuration`.
+
+Relative file paths required
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The one difference from single-instance GConstruct files
+is that we require the file paths listed in the configuration file to be
+`relative to the location of the configuration file.` Specifically:
+
+* All file paths listed **must not** start with ``/``.
+* Assuming the configuration file is under ``$PATH``, and a filepath is listed as ``${FILEPATH}``
+  in the configuration file, the corresponding file is expected to exist at ``${PATH}/${FILEPATH}``.
+
+For example:
+
+.. code-block:: bash
+
+   > pwd
+   /home/path/to/data/ # This is the current working directory (cwd)
+   > ls
+   gconstruct-config.json edge_data # These are the files under the cwd
+   > ls edge_data/ # These are the files under the edge_data directory
+   movie-included_in-genre.csv
+
+The contents of the ``gconstruct-config.json`` can be:
+
+.. code-block:: python
+
+   {
+       "edges" : [
+           {
+               # Note that the file is a relative path
+               "files": ["edge_data/movie-included_in-genre.csv"],
+               "format": {
+                   "name": "csv",
+                   "separator" : ","
+               }
+               # [...] Other edge config values
+           }
+       ]
+   }
+
+Given the above we can run a job with local input data as:
+
+.. code-block:: bash
+
+   > gs-processing --input-data /home/path/to/data \
+       --config-filename gconstruct-config.json
+
+The benefit of using relative paths is that we can move the same files
+to any location, including S3, and run the same job without making changes to the config
+file:
+
+.. code-block:: bash
+
+   # Move all files to a new directory
+   > mv /home/path/to/data /home/new-path/to/data
+   # After moving all the files we can still use the same config
+   > gs-processing --input-data /home/new-path/to/data \
+       --config-filename gconstruct-config.json
+
+   # Upload data to S3
+   > aws s3 sync /home/new-path/to/data s3://my-bucket/data/
+   # We can still use the same config, just change the prefix to an S3 path
+   > python run_distributed_processing.py --input-data s3://my-bucket/data \
+       --config-filename gconstruct-config.json
+
+Node files are optional
+^^^^^^^^^^^^^^^^^^^^^^^
+
+GSProcessing does not require node files to be provided for
+every node type. If a node type appears in one of the edges,
+its unique node identifiers will be determined by the edge files.
+
+In the example GConstruct file above (`gconstruct-config.json`), the node ids for the node types
+``movie`` and ``genre`` will be extracted from the edge list provided.
+
+Example data and configuration
+------------------------------
+
+For this example we use a small heterogeneous graph inspired by the Movielens dataset.
+You can see the configuration file under
+``graphstorm-processing/tests/resources/small_heterogeneous_graph/gconstruct-config.json``.
+
+We have 4 node types: ``movie``, ``genre``, ``director``, and ``user``. The graph has 3
+edge types: ``movie:included_in:genre``, ``user:rated:movie``, and ``director:directed:movie``.
+
+We include one ``no-op`` feature, ``age``, that we directly pass to the output without any transformation,
+and one label, ``gender``, that we transform to prepare the data for a node classification task.
+
+
+Run a GSProcessing job locally
+------------------------------
+
+While GSProcessing is designed to run on distributed clusters,
+we can also run small jobs in a local environment, using a local Spark instance.
+
+To do so, we will be using the ``gs-processing`` entry point
+to process the data and create the output on our local storage.
+
+We will provide an input and output prefix for our data, passing
+local paths to the script.
+
+We also provide the argument ``--num-output-files`` that instructs PySpark
+to try to create output with 4 partitions [#f1]_.
+
+Assuming our working directory is ``graphstorm/graphstorm-processing/``
+we can use the following command to run the processing job locally:
+
+.. code-block:: bash
+
+   gs-processing --config-filename gconstruct-config.json \
+       --input-prefix ./tests/resources/small_heterogeneous_graph \
+       --output-prefix /tmp/gsprocessing-example/ \
+       --num-output-files 4
+
+
+To finalize processing and to wrangle the data into the structure that
+DGL distributed partitioning expects, we need an additional step that
+guarantees the data conform to the expectations of DGL:
+
+.. code-block:: bash
+
+   gs-repartition --input-prefix /tmp/gsprocessing-example/
+
+
+Examining the job output
+------------------------
+
+Once the processing and re-partitioning jobs are done,
+we can examine the outputs they created. The output will be
+compatible with the `Chunked Graph Format of DistDGL `_
+and can be used downstream to create a partitioned graph.
+
+.. code-block:: bash
+
+   $ cd /tmp/gsprocessing-example
+   $ ls
+
+   edges/  launch_arguments.json  metadata.json  node_data/
+   node_id_mappings/  perf_counters.json  updated_row_counts_metadata.json
+
+We have a few JSON files and the data directories containing
+the graph structure, features, and labels.
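+
+As a quick sanity check, you can load the generated ``metadata.json`` and
+inspect the node and edge types it records. This is a minimal illustrative
+sketch, not part of the official tooling, using only the Python standard
+library:
+
+.. code-block:: python
+
+   import json
+
+   # Load the metadata that gs-processing wrote to the output prefix.
+   with open("/tmp/gsprocessing-example/metadata.json", "r", encoding="utf-8") as f:
+       metadata = json.load(f)
+
+   # Among other entries, the metadata lists the node and edge types
+   # of the processed graph.
+   print(metadata["node_type"])
+   print(metadata["edge_type"])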
+In more detail:
+
+* ``launch_arguments.json``: Contains the arguments that were used
+  to launch the processing job, allowing you to check the parameters after the
+  job finishes.
+* ``updated_row_counts_metadata.json``:
+  This file is meant to be used as the input configuration for the
+  distributed partitioning pipeline. ``repartition_files.py`` produces
+  this file using the original ``metadata.json`` file as input.
+* ``metadata.json``: Created by ``gs-processing`` and used as input
+  for ``repartition_files.py``; it can be removed once that script has run.
+* ``perf_counters.json``: A JSON file that contains runtime measurements
+  for the various components of GSProcessing. Can be used to profile the
+  application and discover bottlenecks.
+
+The directories created contain:
+
+* ``edges``: Contains the edge structures, one sub-directory per edge
+  type. Each edge file will contain two columns, the source and destination
+  `numerical` node ids, named ``src_int_id`` and ``dst_int_id`` respectively.
+* ``node_data``: Contains the features for the nodes, one sub-directory
+  per node type. Each file will contain one column named after the original
+  feature name that contains the value of the feature (could be a scalar or a vector).
+* ``node_id_mappings``: Contains mappings from the original node ids to the
+  ones created by the processing job. This mapping allows you to trace
+  predictions back to the original nodes/edges. The files will have two columns,
+  ``node_str_id`` that contains the original string ID of the node, and ``node_int_id``
+  that contains the numerical id that the string id was mapped to.
+
+If the graph had included edge features they would appear
+in an ``edge_data`` directory.
+
+.. note::
+
+   It's important to note that files for edges and edge data will have the
+   same order and row counts per file, as expected by DistDGL. Similarly,
+   all node feature files will have the same order and row counts, where
+   the first row corresponds to the feature value for node id 0, the second
+   for node id 1, and so on.
+
+
+At this point you can use the DGL distributed partitioning pipeline
+to partition your data, as described in the
+`DGL documentation `_.
+
+To simplify the process of partitioning and training, without the need
+to manage your own infrastructure, we recommend using GraphStorm's
+`SageMaker wrappers `_
+that do all the hard work for you and allow
+you to focus on model development.
+
+To run GSProcessing jobs on Amazon SageMaker we'll need to follow
+:doc:`/usage/distributed-processing-setup` to set up our environment
+and :doc:`/usage/amazon-sagemaker` to execute the job.
+
+
+.. rubric:: Footnotes
+
+
+.. [#f1] Note that this is just a hint to the Spark engine, and it's
+   not guaranteed that the number of output partitions will always match
+   the requested value.
\ No newline at end of file
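To illustrate how the id mapping described above can be used, here is a minimal illustrative sketch that maps an internal integer node id back to its original id. It is an editorial addition, assuming ``pandas`` with the ``pyarrow`` engine is installed, and it uses the ``user`` node type and output path from the example:

.. code-block:: python

   import pandas as pd

   # Read the id mapping files for the "user" node type produced by the job.
   mapping = pd.read_parquet("/tmp/gsprocessing-example/node_id_mappings/user/")

   # node_int_id holds the numerical id, node_str_id the original string id.
   int_to_str = dict(zip(mapping["node_int_id"], mapping["node_str_id"]))
   print(int_to_str[0])  # original id of the node with integer id 0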
diff --git a/graphstorm-processing/graphstorm_processing/distributed_executor.py b/graphstorm-processing/graphstorm_processing/distributed_executor.py index ef2db20930..61451a5a5e 100644 --- a/graphstorm-processing/graphstorm_processing/distributed_executor.py +++ b/graphstorm-processing/graphstorm_processing/distributed_executor.py @@ -168,21 +168,31 @@ def __init__( dataset_config_dict: Dict[str, Any] = json.load(f) if "version" in dataset_config_dict: - self.config_version = dataset_config_dict["version"] - if self.config_version == "gsprocessing-v1.0": + config_version = dataset_config_dict["version"] + if config_version == "gsprocessing-v1.0": logging.info("Parsing config file as GSProcessing config") self.graph_config_dict = dataset_config_dict["graph"] - elif self.config_version == "gconstruct-v1.0": + elif config_version == "gconstruct-v1.0": logging.info("Parsing config file as GConstruct config") converter = GConstructConfigConverter() self.graph_config_dict = converter.convert_to_gsprocessing(dataset_config_dict)[ "graph" ] else: - logging.warning("Unrecognized version name: %s", self.config_version) + logging.warning("Unrecognized version name: %s", config_version) + try: + converter = GConstructConfigConverter() + self.graph_config_dict = converter.convert_to_gsprocessing(dataset_config_dict)[ + "graph" + ] + except Exception: # pylint: disable=broad-exception-caught + logging.warning("Could not parse config as GConstruct, trying GSProcessing") + assert ( + "graph" in dataset_config_dict + ), "Top-level element 'graph' needs to exist in a GSProcessing config" + self.graph_config_dict = dataset_config_dict["graph"] else: # Older versions of GConstruct configs might be missing a version entry - self.config_version = "gconstruct" converter = GConstructConfigConverter() self.graph_config_dict = converter.convert_to_gsprocessing(dataset_config_dict)["graph"] diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py index bb844d1ef0..93a5d08109 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py @@ -315,7 +315,7 @@ def _initialize_metadata_dict( # Add original and reverse edge types edge_types.append(f"{src_type}:{rel_type}:{dst_type}") if self.add_reverse_edges: - edge_types.append(f"{dst_type}:rev-{rel_type}:{src_type}") + edge_types.append(f"{dst_type}:{rel_type}-rev:{src_type}") metadata_dict["edge_type"] = edge_types metadata_dict["node_type"] = sorted(node_type_set) @@ -1072,7 +1072,7 @@ def write_edge_structure( f"{edge_config.src_ntype}:{edge_config.get_relation_name()}:{edge_config.dst_ntype}" ) rev_edge_type = ( - f"{edge_config.dst_ntype}:rev-{edge_config.get_relation_name()}:{edge_config.src_ntype}" + f"{edge_config.dst_ntype}:{edge_config.get_relation_name()}-rev:{edge_config.src_ntype}" ) src_node_id_mapping = ( @@ -1223,7 +1223,7 @@ def process_edge_data(self, edge_configs: Sequence[EdgeConfig]) -> Tuple[Dict, D ) reverse_edge_type = ( f"{edge_config.dst_ntype}" - f":rev-{edge_config.get_relation_name()}" + f":{edge_config.get_relation_name()}-rev" f":{edge_config.src_ntype}" ) logging.info("Processing edge type '%s'...", edge_type) diff --git a/graphstorm-processing/graphstorm_processing/repartition_files.py
b/graphstorm-processing/graphstorm_processing/repartition_files.py index d0105267f0..1d13d34253 100644 --- a/graphstorm-processing/graphstorm_processing/repartition_files.py +++ b/graphstorm-processing/graphstorm_processing/repartition_files.py @@ -833,7 +833,7 @@ def main(): for type_idx, (type_name, type_data_dict) in enumerate(edge_data_meta.items()): src, relation, dst = type_name.split(":") - if relation.startswith("rev-"): + if relation.endswith("-rev"): # Reverse edge types do not have their own data, # and if needed we re-partition their structure while # handling the "regular" edge type. @@ -845,7 +845,7 @@ def main(): type_name, ) continue - reverse_edge_type_name = f"{dst}:rev-{relation}:{src}" + reverse_edge_type_name = f"{dst}:{relation}-rev:{src}" most_frequent_counts = list(edge_row_counts_frequencies[type_name].most_common(1)[0][0]) repartitioner = ParquetRepartitioner( input_prefix, filesystem_type, region, verify_outputs=True diff --git a/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json b/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json index 3ddaa0e2ca..bbdf13eb03 100644 --- a/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json +++ b/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json @@ -1,7 +1,7 @@ { "edge_type": [ "src:dummy_type:dst", - "dst:rev-dummy_type:src" + "dst:dummy_type-rev:src" ], "edges": { "src:dummy_type:dst": { @@ -18,7 +18,7 @@ "delimiter": "" } }, - "dst:rev-dummy_type:src": { + "dst:dummy_type-rev:src": { "data": [ "edges/dummy_type/parquet/part-00000.parquet", "edges/dummy_type/parquet/part-00001.parquet", @@ -78,7 +78,7 @@ } } }, - "dst:rev-dummy_type:src": { + "dst:dummy_type-rev:src": { "label": { "data":[ "edge_data/dummy_type-label/parquet/part-00000.parquet", diff --git a/graphstorm-processing/tests/test_dist_heterogenous_loader.py b/graphstorm-processing/tests/test_dist_heterogenous_loader.py index c005a6527d..13e5038f9f 100644 --- a/graphstorm-processing/tests/test_dist_heterogenous_loader.py +++ b/graphstorm-processing/tests/test_dist_heterogenous_loader.py @@ -165,11 +165,11 @@ def verify_integ_test_output( assert metadata["node_type"] == ["director", "genre", "movie", "user"] assert metadata["edge_type"] == [ "movie:included_in:genre", - "genre:rev-included_in:movie", + "genre:included_in-rev:movie", "user:rated:movie", - "movie:rev-rated:user", + "movie:rated-rev:user", "director:directed:movie", - "movie:rev-directed:director", + "movie:directed-rev:director", ] expected_node_counts = {"director": 3, "genre": 2, "movie": 4, "user": 5} @@ -182,11 +182,11 @@ def verify_integ_test_output( expected_edge_counts = { "movie:included_in:genre": 4, - "genre:rev-included_in:movie": 4, + "genre:included_in-rev:movie": 4, "user:rated:movie": 6, - "movie:rev-rated:user": 6, + "movie:rated-rev:user": 6, "director:directed:movie": 4, - "movie:rev-directed:director": 4, + "movie:directed-rev:director": 4, } for edge_type in metadata["edge_type"]: @@ -266,11 +266,11 @@ def test_load_dist_hgl_without_labels(dghl_loader_no_label: DistHeterogeneousGra "task_type": "link_predict", "etype_label": [ "movie:included_in:genre", - "genre:rev-included_in:movie", + "genre:included_in-rev:movie", "user:rated:movie", - "movie:rev-rated:user", + "movie:rated-rev:user", "director:directed:movie", - "movie:rev-directed:director", + "movie:directed-rev:director", ], "etype_label_property": [], "ntype_label": [], @@ -283,18 +283,18 @@ def 
test_load_dist_hgl_without_labels(dghl_loader_no_label: DistHeterogeneousGra expected_edge_data = { "user:rated:movie": {"train_mask", "val_mask", "test_mask"}, - "movie:rev-rated:user": {"train_mask", "val_mask", "test_mask"}, + "movie:rated-rev:user": {"train_mask", "val_mask", "test_mask"}, "movie:included_in:genre": {"train_mask", "val_mask", "test_mask"}, - "genre:rev-included_in:movie": {"train_mask", "val_mask", "test_mask"}, + "genre:included_in-rev:movie": {"train_mask", "val_mask", "test_mask"}, "director:directed:movie": {"train_mask", "val_mask", "test_mask"}, - "movie:rev-directed:director": {"train_mask", "val_mask", "test_mask"}, + "movie:directed-rev:director": {"train_mask", "val_mask", "test_mask"}, } for edge_type in metadata["edge_data"]: assert metadata["edge_data"][edge_type].keys() == expected_edge_data[edge_type] - if not "rev-" in edge_type: + if not "-rev" in edge_type: src_type, relation, dst_type = edge_type.split(":") - rev_type = f"{dst_type}:rev-{relation}:{src_type}" + rev_type = f"{dst_type}:{relation}-rev:{src_type}" assert ( metadata["edge_data"][rev_type]["train_mask"] == metadata["edge_data"][edge_type]["train_mask"] diff --git a/graphstorm-processing/tests/test_repartition_files.py b/graphstorm-processing/tests/test_repartition_files.py index e2a08968db..eb9dc45bba 100644 --- a/graphstorm-processing/tests/test_repartition_files.py +++ b/graphstorm-processing/tests/test_repartition_files.py @@ -217,8 +217,8 @@ def test_verify_metadata_only_edge_data(): row_counts = [10, 10, 10, 10, 10] original_metadata_dict["edge_data"]["src:dummy_type:dst"]["label"]["row_counts"] = row_counts original_metadata_dict["edges"]["src:dummy_type:dst"]["row_counts"] = row_counts - original_metadata_dict["edges"].pop("dst:rev-dummy_type:src") - original_metadata_dict["edge_data"].pop("dst:rev-dummy_type:src") + original_metadata_dict["edges"].pop("dst:dummy_type-rev:src") + original_metadata_dict["edge_data"].pop("dst:dummy_type-rev:src") # Ensure success when counts match repartition_files.verify_metadata( diff --git a/python/graphstorm/config/argument.py b/python/graphstorm/config/argument.py index 0a39bd61e0..743ce237c6 100644 --- a/python/graphstorm/config/argument.py +++ b/python/graphstorm/config/argument.py @@ -132,9 +132,11 @@ def __init__(self, cmd_args): if hasattr(cmd_args, "logging_level") else logging.INFO log_file = cmd_args.logging_file if hasattr(cmd_args, "logging_file") else None if log_file is None: - logging.basicConfig(level=log_level) + # We need to force the logging to reset the existing logging handlers + # in order to make sure this config is effective. + logging.basicConfig(level=log_level, force=True) else: - logging.basicConfig(filename=log_file, level=log_level) + logging.basicConfig(filename=log_file, level=log_level, force=True) self.yaml_paths = cmd_args.yaml_config_file # Load all arguments from yaml config @@ -293,6 +295,7 @@ def verify_arguments(self, is_train): _ = self.restore_model_path _ = self.restore_optimizer_path _ = self.save_embed_path + _ = self.save_embed_format # Model architecture _ = self.dropout @@ -853,6 +856,19 @@ def save_embed_path(self): return self._save_embed_path return None + @property + def save_embed_format(self): + """ Specify the format of saved embeddings. + """ + # pylint: disable=no-member + if hasattr(self, "_save_embed_format"): + assert self._save_embed_format in ["pytorch", "hdf5"], \ + f"{self._save_embed_format} is not supported for save_embed_format. " \ + f"Supported formats: ['pytorch', 'hdf5']."
+ return self._save_embed_format + # default to be 'pytorch' + return "pytorch" + @property def save_model_path(self): """ Path to save the model. @@ -2012,6 +2028,8 @@ def _add_output_args(parser): group.add_argument("--save-embed-path", type=str, default=argparse.SUPPRESS, help="Save the embddings in the specified directory. " "Use none to turn off embedding saveing") + group.add_argument("--save-embed-format", type=str, default=argparse.SUPPRESS, + help="Specify the format for saved embeddings. Valid formats: ['pytorch', 'hdf5']") group.add_argument('--save-model-frequency', type=int, default=argparse.SUPPRESS, help='Save the model every N iterations.') group.add_argument('--save-model-path', type=str, default=argparse.SUPPRESS, diff --git a/python/graphstorm/data/utils.py b/python/graphstorm/data/utils.py index 0f670ce15b..dbd82a518a 100644 --- a/python/graphstorm/data/utils.py +++ b/python/graphstorm/data/utils.py @@ -236,6 +236,19 @@ def alltoall_cpu(rank, world_size, output_tensor_list, input_tensor_list): for i in range(world_size): dist.scatter(output_tensor_list[i], input_tensor_list if i == rank else None, src=i) +def alltoallv_nccl(output_tensor_list, input_tensor_list): + """ Each process scatters a list of input tensors to all processes in a cluster + and returns the gathered list of tensors in the output list, using the nccl backend. + + Parameters + ---------- + output_tensor_list : List of tensor + The received tensors + input_tensor_list : List of tensor + The tensors to exchange + """ + th.distributed.all_to_all(output_tensor_list, input_tensor_list) + def alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list): """Each process scatters list of input tensors to all processes in a cluster and return gathered list of tensors in output list. diff --git a/python/graphstorm/dataloading/dataloading.py b/python/graphstorm/dataloading/dataloading.py index e155f5f063..fdb6ffc0ac 100644 --- a/python/graphstorm/dataloading/dataloading.py +++ b/python/graphstorm/dataloading/dataloading.py @@ -158,6 +158,10 @@ def fanout(self): class GSgnnEdgeDataLoader(GSgnnEdgeDataLoaderBase): """ The minibatch dataloader for edge prediction + GSgnnEdgeDataLoader samples a GraphStorm edge dataset into an iterable over mini-batches + of samples. Both source and destination nodes are included in the batch_graph, which + will be used by GraphStorm Trainers and Inferrers. + Parameters ------------ dataset: GSgnnEdgeData @@ -182,6 +186,23 @@ class GSgnnEdgeDataLoader(GSgnnEdgeDataLoaderBase): The node types that requires to construct node features. construct_feat_fanout : int The fanout required to construct node features. + + Examples + ------------ + To train a 2-layer GNN for edge prediction on a set of edges ``target_idx`` on + a graph where each node takes messages from 15 neighbors on the first layer + and 10 neighbors on the second. + + .. code:: python + + from graphstorm.dataloading import GSgnnEdgeTrainData + from graphstorm.dataloading import GSgnnEdgeDataLoader + from graphstorm.trainer import GSgnnEdgePredictionTrainer + + ep_data = GSgnnEdgeTrainData(...) + ep_dataloader = GSgnnEdgeDataLoader(ep_data, target_idx, fanout=[15, 10], batch_size=128) + ep_trainer = GSgnnEdgePredictionTrainer(...)
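+ # Train the edge prediction model for 10 epochs using the dataloader.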
+ ep_trainer.fit(ep_dataloader, 10) """ def __init__(self, dataset, target_idx, fanout, batch_size, device='cpu', train_task=True, reverse_edge_types_map=None, @@ -357,7 +378,11 @@ def target_eidx(self): class GSgnnLinkPredictionDataLoader(GSgnnLinkPredictionDataLoaderBase): """ Link prediction minibatch dataloader - The negative edges are sampled uniformly. + GSgnnLinkPredictionDataLoader samples a GraphStorm edge dataset into an iterable over mini-batches + of samples. In each batch, pos_graph and neg_graph are sampled subgraphs for the positive and + negative edges, which will be used by GraphStorm Trainers and Inferrers. Given a positive edge, + a negative edge is composed of the source node and a random negative destination node + drawn from a uniform distribution. Argument -------- @@ -387,6 +412,24 @@ class GSgnnLinkPredictionDataLoader(GSgnnLinkPredictionDataLoaderBase): The node types that requires to construct node features. construct_feat_fanout : int The fanout required to construct node features. + + Examples + ------------ + To train a 2-layer GNN for link prediction on a set of positive edges ``target_idx`` on + a graph where each node takes messages from 15 neighbors on the first layer + and 10 neighbors on the second. We use 10 negative edges per positive edge in this example. + + .. code:: python + + from graphstorm.dataloading import GSgnnEdgeTrainData + from graphstorm.dataloading import GSgnnLinkPredictionDataLoader + from graphstorm.trainer import GSgnnLinkPredictionTrainer + + lp_data = GSgnnEdgeTrainData(...) + lp_dataloader = GSgnnLinkPredictionDataLoader(lp_data, target_idx, fanout=[15, 10], + num_negative_edges=10, batch_size=128) + lp_trainer = GSgnnLinkPredictionTrainer(...) + lp_trainer.fit(lp_dataloader, 10) """ def __init__(self, dataset, target_idx, fanout, batch_size, num_negative_edges, device='cpu', train_task=True, reverse_edge_types_map=None, exclude_training_targets=False, @@ -961,6 +1004,10 @@ def fanout(self): class GSgnnNodeDataLoader(GSgnnNodeDataLoaderBase): """ Minibatch dataloader for node tasks + + GSgnnNodeDataLoader samples a GraphStorm node dataset into an iterable over mini-batches of + samples including target nodes and sampled neighbor nodes, which will be used by GraphStorm + Trainers and Inferrers. Parameters ---------- @@ -980,6 +1027,23 @@ class GSgnnNodeDataLoader(GSgnnNodeDataLoaderBase): The node types that requires to construct node features. construct_feat_fanout : int The fanout required to construct node features. + + Examples + ---------- + To train a 2-layer GNN for node classification on a set of nodes ``target_idx`` on + a graph where each node takes messages from 15 neighbors on the first layer + and 10 neighbors on the second. + + .. code:: python + + from graphstorm.dataloading import GSgnnNodeTrainData + from graphstorm.dataloading import GSgnnNodeDataLoader + from graphstorm.trainer import GSgnnNodePredictionTrainer + + np_data = GSgnnNodeTrainData(...) + np_dataloader = GSgnnNodeDataLoader(np_data, target_idx, fanout=[15, 10], batch_size=128) + np_trainer = GSgnnNodePredictionTrainer(...)
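+ # Train the node classification model for 10 epochs using the dataloader.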
+ np_trainer.fit(np_dataloader, 10) """ def __init__(self, dataset, target_idx, fanout, batch_size, device, train_task=True, construct_feat_ntype=None, construct_feat_fanout=5): diff --git a/python/graphstorm/dataloading/dataset.py b/python/graphstorm/dataloading/dataset.py index 97441eb4fa..b06d6a398c 100644 --- a/python/graphstorm/dataloading/dataset.py +++ b/python/graphstorm/dataloading/dataset.py @@ -139,6 +139,13 @@ def __init__(self, graph_name, part_config, node_feat_field, edge_feat_field): self._val_idxs = {} self._test_idxs = {} + if get_rank() == 0: + g = self._g + for ntype in g.ntypes: + logging.debug("%s has %d nodes.", ntype, g.number_of_nodes(ntype)) + for etype in g.canonical_etypes: + logging.debug("%s has %d edges.", str(etype), g.number_of_edges(etype)) + # Use wholegraph for feature transfer if is_distributed() and use_wholegraph(part_config): logging.info("Allocate features with Wholegraph") @@ -275,7 +282,7 @@ def get_node_feats(self, input_nodes, device='cpu'): feat_field=self._node_feat_field) def get_edge_feats(self, input_edges, edge_feat_field, device='cpu'): - """ Get the node features + """ Get the edge features Parameters ---------- @@ -382,7 +389,9 @@ def test_idxs(self): return self._test_idxs class GSgnnEdgeTrainData(GSgnnEdgeData): - """ Edge prediction training data + r""" Edge prediction training data + + The GSgnnEdgeTrainData prepares the data for training edge prediction. Parameters ---------- @@ -404,6 +413,17 @@ class GSgnnEdgeTrainData(GSgnnEdgeData): different feature names. decoder_edge_feat: str or dict of list of str Edge features used by decoder + + Examples + ---------- + .. code:: python + + from graphstorm.dataloading import GSgnnEdgeTrainData + from graphstorm.dataloading import GSgnnEdgeDataLoader + ep_data = GSgnnEdgeTrainData(graph_name='dummy', part_config=part_config, + train_etypes=[('n1', 'e1', 'n2')], label_field='label', + node_feat_field='node_feat', edge_feat_field='edge_feat') + ep_dataloader = GSgnnEdgeDataLoader(ep_data, target_idx={"e1":[0]}, fanout=[15, 10], batch_size=128) """ def __init__(self, graph_name, part_config, train_etypes, eval_etypes=None, label_field=None, node_feat_field=None, edge_feat_field=None, @@ -543,7 +563,9 @@ def pos_graph_feat_field(self): return self._pos_graph_feat_field class GSgnnEdgeInferData(GSgnnEdgeData): - """ Edge prediction inference data + r""" Edge prediction inference data + + GSgnnEdgeInferData prepares the data for edge prediction inference. Parameters ---------- @@ -563,6 +585,17 @@ class GSgnnEdgeInferData(GSgnnEdgeData): different feature names. decoder_edge_feat: str or dict of list of str Edge features used by decoder + + Examples + ---------- + .. code:: python + + from graphstorm.dataloading import GSgnnEdgeInferData + from graphstorm.dataloading import GSgnnEdgeDataLoader + ep_data = GSgnnEdgeInferData(graph_name='dummy', part_config=part_config, + eval_etypes=[('n1', 'e1', 'n2')], label_field='label', + node_feat_field='node_feat', edge_feat_field='edge_feat') + ep_dataloader = GSgnnEdgeDataLoader(ep_data, target_idx={"e1":[0]}, fanout=[15, 10], batch_size=128) """ def __init__(self, graph_name, part_config, eval_etypes, label_field=None, node_feat_field=None, edge_feat_field=None, @@ -599,13 +632,14 @@ def prepare_data(self, g): for canonical_etype in self.eval_etypes: if 'test_mask' in g.edges[canonical_etype].data: # test_mask exists - # we will do evaluation. + # we will do evaluation or inference on test data. 
test_idx = dgl.distributed.edge_split( g.edges[canonical_etype].data['test_mask'], pb, etype=canonical_etype, force_even=True) # If there are test data globally, we should add them to the dict. if test_idx is not None and dist_sum(len(test_idx)) > 0: test_idxs[canonical_etype] = test_idx + infer_idxs[canonical_etype] = test_idx else: # Inference only # we will do inference on the entire edge set @@ -706,7 +740,9 @@ def test_idxs(self): return self._test_idxs class GSgnnNodeTrainData(GSgnnNodeData): - """ Training data for node tasks + r""" Training data for node tasks + + GSgnnNodeTrainData prepares the data for training node prediction. Parameters ---------- @@ -726,6 +762,18 @@ class GSgnnNodeTrainData(GSgnnNodeData): edge_feat_field : str or dict of list of str The field of the edge features. It's a dict if different edge types have different feature names. + + Examples + ---------- + .. code:: python + + from graphstorm.dataloading import GSgnnNodeTrainData + from graphstorm.dataloading import GSgnnNodeDataLoader + + np_data = GSgnnNodeTrainData(graph_name='dummy', part_config=part_config, + train_ntypes=['n1'], label_field='label', + node_feat_field='feat') + np_dataloader = GSgnnNodeDataLoader(np_data, target_idx={'n1':[0]}, fanout=[15, 10], batch_size=128) """ def __init__(self, graph_name, part_config, train_ntypes, eval_ntypes=None, label_field=None, node_feat_field=None, edge_feat_field=None): @@ -833,7 +881,9 @@ def eval_ntypes(self): return self._eval_ntypes class GSgnnNodeInferData(GSgnnNodeData): - """ Inference data for node tasks + r""" Inference data for node tasks + + GSgnnNodeInferData prepares the data for node prediction inference. Parameters ---------- @@ -851,6 +901,18 @@ class GSgnnNodeInferData(GSgnnNodeData): edge_feat_field : str or dict of list of str The field of the edge features. It's a dict if different edge types have different feature names. + + Examples + ---------- + .. code:: python + + from graphstorm.dataloading import GSgnnNodeInferData + from graphstorm.dataloading import GSgnnNodeDataLoader + + np_data = GSgnnNodeInferData(graph_name='dummy', part_config=part_config, + eval_ntypes=['n1'], label_field='label', + node_feat_field='feat') + np_dataloader = GSgnnNodeDataLoader(np_data, target_idx={'n1':[0]}, fanout=[15, 10], batch_size=128) """ def __init__(self, graph_name, part_config, eval_ntypes, label_field=None, node_feat_field=None, edge_feat_field=None): @@ -885,13 +947,14 @@ def prepare_data(self, g): if 'trainer_id' in g.nodes[ntype].data else None if 'test_mask' in g.nodes[ntype].data: # test_mask exists - # we will do evaluation. + # we will do evaluation or inference on test data. test_idx = dgl.distributed.node_split(g.nodes[ntype].data['test_mask'], pb, ntype=ntype, force_even=True, node_trainer_ids=node_trainer_ids) # If there are test data globally, we should add them to the dict.
if test_idx is not None and dist_sum(len(test_idx)) > 0: test_idxs[ntype] = test_idx + infer_idxs[ntype] = test_idx elif test_idx is None: logging.warning("%s does not contains test data, skip testing %s", ntype, ntype) diff --git a/python/graphstorm/eval/utils.py b/python/graphstorm/eval/utils.py index 60febc1689..847919171b 100644 --- a/python/graphstorm/eval/utils.py +++ b/python/graphstorm/eval/utils.py @@ -18,7 +18,7 @@ import torch as th from ..utils import get_backend, is_distributed -from ..data.utils import alltoallv_cpu +from ..data.utils import alltoallv_cpu, alltoallv_nccl def calc_distmult_pos_score(h_emb, t_emb, r_emb, device=None): """ Calculate DistMulti Score for positive pairs @@ -321,7 +321,7 @@ def broadcast_data(rank, world_size, data_tensor): if get_backend() == "gloo": alltoallv_cpu(rank, world_size, gather_list, data_tensors) else: #get_backend() == "nccl" - th.distributed.all_to_all(gather_list, data_tensors) + alltoallv_nccl(gather_list, data_tensors) data_tensor = th.cat(gather_list, dim=0) return data_tensor diff --git a/python/graphstorm/gconstruct/file_io.py b/python/graphstorm/gconstruct/file_io.py index 0deaa15ef9..bc90045de3 100644 --- a/python/graphstorm/gconstruct/file_io.py +++ b/python/graphstorm/gconstruct/file_io.py @@ -266,15 +266,44 @@ def read_data_hdf5(data_file, data_fields=None, in_mem=True): data[name] = f[name][:] if in_mem else HDF5Array(f[name], handle) return data +def stream_dist_tensors_to_hdf5(data, data_file, chunk_size=100000): + """ Stream write dict of dist tensor into a HDF5 file. + + Parameters + ---------- + data : dict of dist tensor + The data to be saved to the hdf5 file. + data_file : str + The file name of the hdf5 file. + chunk_size : int + The size of a chunk to extract from dist tensor. + """ + with h5py.File(data_file, "w") as f: + for key, val in data.items(): + arr = f.create_dataset(key, val.shape, dtype=np.array(val[0]).dtype) + if len(val) > chunk_size: + num_chunks = len(val) // chunk_size + remainder = len(val) % chunk_size + for i in range(num_chunks): + # extract a chunk from dist tensor + chunk_val = np.array(val[i*chunk_size:(i+1)*chunk_size]) + arr[i*chunk_size:(i+1)*chunk_size] = chunk_val + # need to write remainder + if remainder != 0: + remainder_val = np.array(val[num_chunks*chunk_size:len(val)]) + arr[num_chunks*chunk_size:] = remainder_val + else: + arr[:] = np.array(val[0:len(val)]) + def write_data_hdf5(data, data_file): """ Write data into a HDF5 file. Parameters ---------- data : dict - The data to be saved to the Parquet file. + The data to be saved to the hdf5 file. data_file : str - The file name of the Parquet file. + The file name of the hdf5 file. """ with h5py.File(data_file, "w") as f: for key, val in data.items(): diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py index 3575942308..2361cafbd9 100644 --- a/python/graphstorm/gconstruct/transform.py +++ b/python/graphstorm/gconstruct/transform.py @@ -442,6 +442,7 @@ def call(self, feats): class NumericalMinMaxTransform(TwoPhaseFeatTransform): """ Numerical value with Min-Max normalization. + $val = (val-min) / (max-min)$ Parameters @@ -451,17 +452,29 @@ class NumericalMinMaxTransform(TwoPhaseFeatTransform): feat_name : str The feature name used in the constructed graph. max_bound : float - The maximum float value. + The maximum float value. Any number larger than max_bound will be set to max_bound. min_bound : float - The minimum float value + The minimum float value.
Any number smaller than min_bound will be set to min_bound. + max_val : list of float + Define the value of `max` in the Min-Max normalization formula for each feature. + If max_val is set, max_bound will be ignored. + min_val : list of float + Define the value of `min` in the Min-Max normalization formula for each feature. + If min_val is set, min_bound will be ignored. out_dtype: The dtype of the transformed feature. Default: None, we will not do data type casting. + transform_conf : dict + The configuration for the feature transformation. """ def __init__(self, col_name, feat_name, max_bound=sys.float_info.max, min_bound=-sys.float_info.max, - out_dtype=None): + max_val=None, min_val=None, + out_dtype=None, transform_conf=None): + self._max_val = np.array(max_val, dtype=np.float32) if max_val is not None else None + self._min_val = np.array(min_val, dtype=np.float32) if min_val is not None else None + self._conf = transform_conf self._max_bound = max_bound self._min_bound = min_bound out_dtype = np.float32 if out_dtype is None else out_dtype @@ -477,6 +490,12 @@ def pre_process(self, feats): """ assert isinstance(feats, (np.ndarray, ExtMemArrayWrapper)), \ "Feature of NumericalMinMaxTransform must be numpy array or ExtMemArray" + + # The max and min of $val = (val-min) / (max-min)$ is pre-defined + # in the transform_conf, return max_val and min_val directly + if self._max_val is not None and self._min_val is not None: + return {self.feat_name: (self._max_val, self._min_val)} + if isinstance(feats, ExtMemArrayWrapper): # TODO(xiangsx): This is not memory efficient. # It will load all data into main memory. @@ -493,15 +512,22 @@ def pre_process(self, feats): feats = feats.astype(np.float32) except: # pylint: disable=bare-except raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") - assert len(feats.shape) <= 2, "Only support 1D fp feature or 2D fp feature" - max_val = np.amax(feats, axis=0) if len(feats.shape) == 2 \ - else np.array([np.amax(feats, axis=0)]) - min_val = np.amin(feats, axis=0) if len(feats.shape) == 2 \ - else np.array([np.amin(feats, axis=0)]) - max_val[max_val > self._max_bound] = self._max_bound - min_val[min_val < self._min_bound] = self._min_bound + if self._max_val is None: + max_val = np.amax(feats, axis=0) if len(feats.shape) == 2 \ + else np.array([np.amax(feats, axis=0)]) + max_val[max_val > self._max_bound] = self._max_bound + else: + max_val = self._max_val + + if self._min_val is None: + min_val = np.amin(feats, axis=0) if len(feats.shape) == 2 \ + else np.array([np.amin(feats, axis=0)]) + min_val[min_val < self._min_bound] = self._min_bound + else: + min_val = self._min_val + return {self.feat_name: (max_val, min_val)} def update_info(self, info): @@ -528,6 +554,11 @@ def update_info(self, info): self._max_val = max_val self._min_val = min_val + # We need to save the max_val and min_val in the config object. 
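+ # Keeping the computed max/min in the transformation config lets the + # exact same normalization be re-applied later, e.g. when the saved + # config is used to transform new data.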
+ if self._conf is not None: + self._conf['max_val'] = self._max_val.tolist() + self._conf['min_val'] = self._min_val.tolist() + def call(self, feats): """ Do normalization for feats @@ -878,11 +909,15 @@ def parse_feat_ops(confs): elif conf['name'] == 'max_min_norm': max_bound = conf['max_bound'] if 'max_bound' in conf else sys.float_info.max min_bound = conf['min_bound'] if 'min_bound' in conf else -sys.float_info.max + max_val = conf['max_val'] if 'max_val' in conf else None + min_val = conf['min_val'] if 'min_val' in conf else None transform = NumericalMinMaxTransform(feat['feature_col'], feat_name, max_bound, min_bound, - out_dtype=out_dtype) + max_val, + min_val, + out_dtype=out_dtype, transform_conf=conf) elif conf['name'] == 'rank_gauss': epsilon = conf['epsilon'] if 'epsilon' in conf else None transform = RankGaussTransform(feat['feature_col'], diff --git a/python/graphstorm/gsf.py b/python/graphstorm/gsf.py index 644bb380a9..f9d0acf8a3 100644 --- a/python/graphstorm/gsf.py +++ b/python/graphstorm/gsf.py @@ -81,16 +81,16 @@ class Options: # pylint: disable=missing-class-docstring Options.local_size) def initialize(ip_config, backend, use_wholegraph=False): - """ Initialize distributed inference context + """ Initialize distributed training and inference context. Parameters ---------- ip_config: str - File path of ip_config file + File path of ip_config file, e.g., `/tmp/ip_list.txt`. backend: str - Torch distributed backend + Torch distributed backend, e.g., ``gloo`` or ``nccl``. use_wholegraph: bool - Whether to use wholegraph for feature transfer + Whether to use wholegraph for feature transfer. """ # We need to use socket for communication in DGL 0.8. The tensorpipe backend has a bug. # This problem will be fixed in the future. @@ -116,6 +116,7 @@ def get_feat_size(g, node_feat_names): Returns ------- dict of int : the feature size for each node type. + """ feat_size = {} for ntype in g.ntypes: diff --git a/python/graphstorm/inference/ep_infer.py b/python/graphstorm/inference/ep_infer.py index 5804e9fcc2..771cdc7f4e 100644 --- a/python/graphstorm/inference/ep_infer.py +++ b/python/graphstorm/inference/ep_infer.py @@ -40,10 +40,11 @@ class GSgnnEdgePredictionInferrer(GSInferrer): """ def infer(self, loader, save_embed_path, save_prediction_path=None, - use_mini_batch_infer=False, # pylint: disable=unused-argument - node_id_mapping_file=None, - edge_id_mapping_file=None, - return_proba=True): + use_mini_batch_infer=False, # pylint: disable=unused-argument + node_id_mapping_file=None, + edge_id_mapping_file=None, + return_proba=True, + save_embed_format="pytorch"): """ Do inference The inference can do three things: @@ -67,6 +68,8 @@ def infer(self, loader, save_embed_path, save_prediction_path=None, graph partition algorithm. return_proba: bool Whether to return all the predictions or the maximum prediction. + save_embed_format : str + Specify the format of saved embeddings. """ do_eval = self.evaluator is not None if do_eval: assert loader.data.labels is not None, \ "A label field must be provided for edge classification " \ "or regression inference when evaluation is required." + if use_mini_batch_infer: + assert save_embed_path is None, \ + "Unable to save the node embeddings when using mini batch inference. " \ + "It is not guaranteed that mini-batch prediction will cover all the nodes."
sys_tracker.check('start inferencing') self._model.eval() @@ -120,10 +127,12 @@ def infer(self, loader, save_embed_path, save_prediction_path=None, # The order of the ntypes must be sorted embs = {ntype: embs[ntype] for ntype in sorted(target_ntypes)} save_gsgnn_embeddings(save_embed_path, embs, get_rank(), - get_world_size(), device=device, - node_id_mapping_file=node_id_mapping_file) - barrier() - sys_tracker.check('save embeddings') + get_world_size(), + device=device, + node_id_mapping_file=node_id_mapping_file, + save_embed_format=save_embed_format) + barrier() + sys_tracker.check('save embeddings') if save_prediction_path is not None: if edge_id_mapping_file is not None: diff --git a/python/graphstorm/inference/lp_infer.py b/python/graphstorm/inference/lp_infer.py index d82d485baf..c43bd0ab40 100644 --- a/python/graphstorm/inference/lp_infer.py +++ b/python/graphstorm/inference/lp_infer.py @@ -40,9 +40,10 @@ class GSgnnLinkPredictionInferrer(GSInferrer): # TODO(zhengda) We only support full-graph inference for now. def infer(self, data, loader, save_embed_path, - edge_mask_for_gnn_embeddings='train_mask', - use_mini_batch_infer=False, - node_id_mapping_file=None): + edge_mask_for_gnn_embeddings='train_mask', + use_mini_batch_infer=False, + node_id_mapping_file=None, + save_embed_format="pytorch"): """ Do inference The inference can do two things: @@ -67,6 +68,8 @@ def infer(self, data, loader, save_embed_path, node_id_mapping_file: str Path to the file storing node id mapping generated by the graph partition algorithm. + save_embed_format : str + Specify the format of saved embeddings. """ sys_tracker.check('start inferencing') self._model.eval() @@ -82,10 +85,12 @@ def infer(self, data, loader, save_embed_path, device = self.device if save_embed_path is not None: save_gsgnn_embeddings(save_embed_path, embs, get_rank(), - get_world_size(), device=device, - node_id_mapping_file=node_id_mapping_file) - barrier() - sys_tracker.check('save embeddings') + get_world_size(), + device=device, + node_id_mapping_file=node_id_mapping_file, + save_embed_format=save_embed_format) + barrier() + sys_tracker.check('save embeddings') if self.evaluator is not None: test_start = time.time() diff --git a/python/graphstorm/inference/np_infer.py b/python/graphstorm/inference/np_infer.py index 344d181225..0204b1a002 100644 --- a/python/graphstorm/inference/np_infer.py +++ b/python/graphstorm/inference/np_infer.py @@ -43,7 +43,8 @@ class GSgnnNodePredictionInferrer(GSInferrer): def infer(self, loader, save_embed_path, save_prediction_path=None, use_mini_batch_infer=False, node_id_mapping_file=None, - return_proba=True): + return_proba=True, + save_embed_format="pytorch"): """ Do inference The inference does three things: @@ -67,6 +68,8 @@ def infer(self, loader, save_embed_path, save_prediction_path=None, graph partition algorithm. return_proba: bool Whether to return all the predictions or the maximum prediction. + save_embed_format : str + Specify the format of saved embeddings. 
""" do_eval = self.evaluator is not None if do_eval: @@ -134,9 +137,11 @@ def infer(self, loader, save_embed_path, save_prediction_path=None, ntype_emb = embs[ntype] embeddings = {ntype: ntype_emb} - save_gsgnn_embeddings(save_embed_path, embeddings, - get_rank(), get_world_size(), device=device, - node_id_mapping_file=node_id_mapping_file) + save_gsgnn_embeddings(save_embed_path, embeddings, get_rank(), + get_world_size(), + device=device, + node_id_mapping_file=node_id_mapping_file, + save_embed_format=save_embed_format) barrier() sys_tracker.check('save embeddings') diff --git a/python/graphstorm/model/edge_gnn.py b/python/graphstorm/model/edge_gnn.py index ce0fea7cfd..d9da341547 100644 --- a/python/graphstorm/model/edge_gnn.py +++ b/python/graphstorm/model/edge_gnn.py @@ -89,8 +89,9 @@ def predict(self, blocks, target_edges, node_feats, edge_feats, is true otherwise return the maximum value. """ -class GSgnnEdgeModelBase(GSgnnModelBase, # pylint: disable=abstract-method - GSgnnEdgeModelInterface): +# pylint: disable=abstract-method +class GSgnnEdgeModelBase(GSgnnEdgeModelInterface, + GSgnnModelBase): """ The base class for edge-prediction GNN When a user wants to define an edge prediction GNN model and train the model @@ -216,7 +217,7 @@ def edge_mini_batch_gnn_predict(model, loader, return_proba=True, return_label=F for etype in target_edge_graph.canonical_etypes} edge_decoder_feats = data.get_edge_feats(input_edges, data.decoder_edge_feat, - target_edge_graph.device) + device) edge_decoder_feats = {etype: feat.to(th.float32) \ for etype, feat in edge_decoder_feats.items()} else: diff --git a/python/graphstorm/model/gnn.py b/python/graphstorm/model/gnn.py index fb5c354336..5f59f63c9b 100644 --- a/python/graphstorm/model/gnn.py +++ b/python/graphstorm/model/gnn.py @@ -160,7 +160,8 @@ def restore_dense_model(self, restore_model_path, To restore model parameters for a model with a node_input_encoder, a GNN layer and a decoder: - .. code:: + .. code:: python + # suppose we are going to load all layers. input_encoder = self.input_encoder gnn_model = self.gnn_model @@ -198,7 +199,8 @@ def restore_sparse_model(self, restore_model_path): -------- To load sparse model parameters for a node_input_encoder - .. code:: + .. code:: python + from graphstorm.model.utils import load_sparse_emb for ntype, sparse_emb in sparse_embeds.items(): @@ -222,7 +224,8 @@ def save_dense_model(self, model_path): Example: -------- - .. code:: + .. code:: python + # This function is only called by rank 0 input_encoder = self.input_encoder gnn_model = self.gnn_model @@ -259,18 +262,20 @@ def save_sparse_model(self, model_path): Step 1: Create a path to save the learnable node embeddings. - .. code:: - from graphstorm.model.util import create_sparse_emb_path + .. code:: python + from graphstorm.model.util import create_sparse_emb_path + for ntype, sparse_emb in sparse_embeds.items(): create_sparse_emb_path(model_path, ntype) # make sure rank 0 creates the folder and change permission first - Step 2: Save learnable node embeddings. - .. code:: - from graphstorm.model.utils import save_sparse_emb + .. code:: python + + from graphstorm.model.utils import save_sparse_emb + for ntype, sparse_emb in sparse_embeds.items(): save_sparse_emb(model_path, sparse_emb, ntype) @@ -291,9 +296,11 @@ def restore_model(self, restore_model_path, model_layer_to_load=None): -------- Load a model from "/tmp/checkpoints". - .. code:: + .. 
code:: python + # CustomGSgnnModel is a child class of GSgnnModelBase model = CustomGSgnnModel() + # Restore model parameters from "/tmp/checkpoints" model.restore_model("/tmp/checkpoints") @@ -308,7 +315,8 @@ def restore_model(self, restore_model_path, model_layer_to_load=None): start_load_t = time.time() # Restore the model weights from a checkpoint saved previously. if restore_model_path is not None: - logging.debug('load model from %s', restore_model_path) + if get_rank() == 0: + logging.debug('load model from %s', restore_model_path) self.restore_dense_model(restore_model_path, model_layer_to_load) # If a user doesn't specify the layer to load, @@ -316,7 +324,8 @@ def restore_model(self, restore_model_path, model_layer_to_load=None): if model_layer_to_load is None \ or GRAPHSTORM_MODEL_EMBED_LAYER in model_layer_to_load \ or GRAPHSTORM_MODEL_SPARSE_EMBED_LAYER in model_layer_to_load: - logging.debug('Load Sparse embedding from %s', restore_model_path) + if get_rank() == 0: + logging.debug('Load Sparse embedding from %s', restore_model_path) self.restore_sparse_model(restore_model_path) # We need to make sure that the sparse embedding is completely loaded @@ -336,9 +345,11 @@ def save_model(self, model_path): -------- Save a model into "/tmp/checkpoints". - .. code:: + .. code:: python + # CustomGSgnnModel is a child class of GSgnnModelBase model = CustomGSgnnModel() + # Model parameters will be saved into "/tmp/checkpoints" model.save_model("/tmp/checkpoints") @@ -377,12 +388,19 @@ def create_optimizer(self): optimizers. Example: + Case 1: if there is only one optimizer: + + .. code:: python + def create_optimizer(self): # define torch.optim.Optimizer return optimizer Case 2: if there are both dense and sparse optimizers: + + .. code:: python + def create_optimizer(self): dense = [dense_opt] # define torch.optim.Optimizer sparse = [sparse_opt] # define dgl sparse Optimizer diff --git a/python/graphstorm/model/lp_gnn.py b/python/graphstorm/model/lp_gnn.py index 60ea6dbfdd..5e29e6fa82 100644 --- a/python/graphstorm/model/lp_gnn.py +++ b/python/graphstorm/model/lp_gnn.py @@ -54,8 +54,9 @@ def forward(self, blocks, pos_graph, neg_graph, The loss of prediction. """ -class GSgnnLinkPredictionModelBase(GSgnnModelBase, # pylint: disable=abstract-method - GSgnnLinkPredictionModelInterface): +# pylint: disable=abstract-method +class GSgnnLinkPredictionModelBase(GSgnnLinkPredictionModelInterface, + GSgnnModelBase): """ The base class for link-prediction GNN When a user wants to define a link prediction GNN model and train the model diff --git a/python/graphstorm/model/node_gnn.py b/python/graphstorm/model/node_gnn.py index 56504e1186..3e749756f7 100644 --- a/python/graphstorm/model/node_gnn.py +++ b/python/graphstorm/model/node_gnn.py @@ -80,8 +80,9 @@ def predict(self, blocks, node_feats, edge_feats, input_nodes, return_proba): The GNN embeddings. 
""" -class GSgnnNodeModelBase(GSgnnModelBase, # pylint: disable=abstract-method - GSgnnNodeModelInterface): +# pylint: disable=abstract-method +class GSgnnNodeModelBase(GSgnnNodeModelInterface, + GSgnnModelBase): """ The base class for node-prediction GNN When a user wants to define a node prediction GNN model and train the model diff --git a/python/graphstorm/model/rgat_encoder.py b/python/graphstorm/model/rgat_encoder.py index a6c21880f5..118d372bee 100644 --- a/python/graphstorm/model/rgat_encoder.py +++ b/python/graphstorm/model/rgat_encoder.py @@ -176,6 +176,7 @@ class RelationalGATEncoder(GraphConvEncoder): r"""Relational graph attention encoder Parameters + ----------- g : DGLHeteroGraph Input graph. h_dim: int diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py index ff2458d3ab..304eb4fa90 100644 --- a/python/graphstorm/model/utils.py +++ b/python/graphstorm/model/utils.py @@ -27,8 +27,29 @@ from torch.nn.parallel import DistributedDataParallel import dgl -from ..utils import get_rank, get_world_size, barrier -from ..data.utils import alltoallv_cpu +from ..gconstruct.file_io import stream_dist_tensors_to_hdf5 +from ..utils import get_rank, barrier, get_world_size +from ..data.utils import alltoallv_cpu, alltoallv_nccl + +def pad_file_index(file_index, width=5): + """ Left pad file_index with zerros. + + for examaple, given 1, it will return 00001. + + Parameters + ---------- + file_index: int + Index of the file + width: int + Minimum length of resulting string; strings with length less + than width be prepended with 0 characters. + + Return + ------ + str: padded file_index + """ + assert width > 1, "Width should be larger than 1" + return str(file_index).zfill(width) def sparse_emb_initializer(emb): """ Initialize sparse embedding @@ -178,7 +199,7 @@ def save_sparse_emb(model_path, sparse_emb, ntype): # the create_sparse_embeds_path() method first before calling save_sparse_embeds(). emb_path = os.path.join(model_path, ntype) os.makedirs(emb_path, exist_ok=True) - emb_file_path = os.path.join(emb_path, f'sparse_emb_{rank}.pt') + emb_file_path = os.path.join(emb_path, f'sparse_emb_{pad_file_index(rank)}.pt') th.save(embs, emb_file_path) def save_sparse_embeds(model_path, embed_layer): @@ -312,6 +333,9 @@ def _exchange_node_id_mapping(rank, world_size, device, node_id_mapping_file: str Path to the file storing node id mapping generated by the graph partition algorithm. + + Return: + Tensor: sub node_id_mappings corresponding to `rank` """ backend = th.distributed.get_backend() device = th.device('cpu') if backend == "gloo" else device @@ -340,12 +364,116 @@ def _exchange_node_id_mapping(rank, world_size, device, if backend == "gloo": alltoallv_cpu(rank, world_size, gather_list, data_tensors) else: # backend == "nccl" - th.distributed.all_to_all(gather_list, data_tensors) - return gather_list[0] + alltoallv_nccl(gather_list, data_tensors) + # move mapping into CPU + return gather_list[0].to(th.device("cpu")) -def save_embeddings(model_path, embeddings, rank, world_size, - device=th.device('cpu'), node_id_mapping_file=None): - """ Save embeddings in a distributed way +def distribute_nid_map(embeddings, rank, world_size, + node_id_mapping_file, device=th.device('cpu')): + """ Distribute nid_map to all workers. + + Parameters + ---------- + embeddings : DistTensor + Embeddings to save + rank : int + Local rank + world_size : int + World size in a distributed env. 
+ node_id_mapping_file: str + Path to the file storing node id mapping generated by the + graph partition algorithm. + device: torch device + Device used for all_to_allv data exchange. For gloo backend + we store data in CPU, For nccl backend, we need to store + data in GPU. + + Returns + ------- + Tensor or dict of Tensor: the node id mapping slice owned by `rank`. + """ + assert node_id_mapping_file is not None + if isinstance(embeddings, (dgl.distributed.DistTensor, LazyDistTensor)): + # only host 0 will load node id mapping from disk + if rank == 0: + ori_node_id_mapping = th.load(node_id_mapping_file) + _, node_id_mapping = th.sort(ori_node_id_mapping) + else: + node_id_mapping = None + + nid_mapping = _exchange_node_id_mapping( + rank, world_size, device, node_id_mapping, len(embeddings)) + elif isinstance(embeddings, dict): + nid_mapping = {} + # only host 0 will load node id mapping from disk + node_id_mappings = th.load(node_id_mapping_file) \ + if rank == 0 else None + + for name, emb in embeddings.items(): + if rank == 0: + assert name in node_id_mappings, \ + f"node id mapping for ntype {name} should exist" + # new mapping back index + ori_node_id_mapping = node_id_mappings[name] + _, node_id_mapping = th.sort(ori_node_id_mapping) + else: + node_id_mapping = None + + nid_mapping[name] = _exchange_node_id_mapping( + rank, world_size, device, node_id_mapping, len(emb)) + else: + nid_mapping = None + return nid_mapping + +def remap_embeddings(embeddings, rank, world_size, + node_id_mapping_file, device=th.device('cpu')): + """ Remap embeddings by nid_map without writing to disk. + + Parameters + ---------- + embeddings : DistTensor or dict of DistTensor + Embeddings to remap + rank : int + Local rank + world_size : int + World size in a distributed env. + node_id_mapping_file: str + Path to the file storing node id mapping generated by the + graph partition algorithm. + device: torch device + Device used for all_to_allv data exchange. For gloo backend + we store data in CPU, For nccl backend, we need to store + data in GPU. + + Returns + ------- + DistTensor : remapped DistTensor + """ + assert node_id_mapping_file is not None + + # TODO: handle when node_id_mapping_file is None. + nid_mapping = distribute_nid_map(embeddings, rank, world_size, + node_id_mapping_file, device) + + if isinstance(embeddings, (dgl.distributed.DistTensor, LazyDistTensor)): + start, end = _get_data_range(rank, world_size, len(embeddings)) + embeddings[list(range(start, end))] = embeddings[nid_mapping] + elif isinstance(embeddings, dict): + # We need to duplicate the dict so that the input argument is not changed. + embeddings = dict(embeddings.items()) + for name, emb in embeddings.items(): + if isinstance(emb, (dgl.distributed.DistTensor, LazyDistTensor)): + # this is the same window as nid_mapping + start, end = _get_data_range(rank, world_size, len(emb)) + # keep emb as a dist tensor; update it in place + emb[th.arange(start, end)] = emb[nid_mapping[name]] + th.distributed.barrier() + + return embeddings + +def save_pytorch_embeddings(model_path, embeddings, rank, world_size, + device=th.device('cpu'), node_id_mapping_file=None): + """ Save embeddings in a distributed way through PyTorch Parameters ---------- @@ -353,11 +481,10 @@ def save_embeddings(model_path, embeddings, rank, world_size, The path of the folder where the model is saved. embeddings : DistTensor Embeddings to save - rank: int + rank : int Rank of the current process in a distributed environment. world_size : int - World size in a distributed environment.
This tells the size of a distributed cluster - (How many processes in a cluster). + World size in a distributed env. device: torch device Device used for all_to_allv data exchange. For gloo backend we store data in CPU, For nccl backend, we need to store @@ -366,7 +493,6 @@ def save_embeddings(model_path, embeddings, rank, world_size, Path to the file storing node id mapping generated by the graph partition algorithm. """ - os.makedirs(model_path, exist_ok=True) # [04/16]: Only rank 0 can chmod to let all other ranks to write files. if rank == 0: # mode 767 means rwx-rw-rwx: @@ -383,36 +509,8 @@ def save_embeddings(model_path, embeddings, rank, world_size, # less than 10 billion. An ID mapping of 10 billion nodes # will take around 80 GByte. if node_id_mapping_file is not None: - if isinstance(embeddings, (dgl.distributed.DistTensor, LazyDistTensor)): - # only host 0 will load node id mapping from disk - if rank == 0: - ori_node_id_mapping = th.load(node_id_mapping_file) - _, node_id_mapping = th.sort(ori_node_id_mapping) - else: - node_id_mapping = None - - nid_mapping = _exchange_node_id_mapping( - rank, world_size, device, node_id_mapping, len(embeddings)) - elif isinstance(embeddings, dict): - nid_mapping = {} - # only host 0 will load node id mapping from disk - node_id_mappings = th.load(node_id_mapping_file) \ - if rank == 0 else None - - for name, emb in embeddings.items(): - if rank == 0: - assert name in node_id_mappings, \ - f"node id mapping for ntype {name} should exists" - # new mapping back index - ori_node_id_mapping = node_id_mappings[name] - _, node_id_mapping = th.sort(ori_node_id_mapping) - else: - node_id_mapping = None - - nid_mapping[name] = _exchange_node_id_mapping( - rank, world_size, device, node_id_mapping, len(emb)) - else: - nid_mapping = None + nid_mapping = distribute_nid_map(embeddings, rank, world_size, + node_id_mapping_file, device) else: nid_mapping = None @@ -442,16 +540,86 @@ def save_embeddings(model_path, embeddings, rank, world_size, if isinstance(embeddings, dict): # embedding per node type for name, emb in embeddings.items(): - th.save(emb, os.path.join(model_path, f'{name}_emb.part{rank}.bin')) + th.save(emb, os.path.join(model_path, f'{name}_emb.part{pad_file_index(rank)}.bin')) emb_info["emb_name"].append(name) else: - th.save(embeddings, os.path.join(model_path, f'emb.part{rank}.bin')) + th.save(embeddings, os.path.join(model_path, f'emb.part{pad_file_index(rank)}.bin')) emb_info["emb_name"] = None if rank == 0: with open(os.path.join(model_path, "emb_info.json"), 'w', encoding='utf-8') as f: f.write(json.dumps(emb_info)) +def save_hdf5_embeddings(model_path, embeddings, rank, world_size, + device=th.device('cpu'), node_id_mapping_file=None): + """ Save embeddings through hdf5 into a single file. + + Parameters + ---------- + model_path : str + The path of the folder where the model is saved. + embeddings : DistTensor + Embeddings to save + rank : int + Rank of the current process in a distributed environment. + world_size : int + World size in a distributed env. + device: torch device + Device used for all_to_allv data exchange. For gloo backend + we store data in CPU, For nccl backend, we need to store + data in GPU. + node_id_mapping_file: str + Path to the file storing node id mapping generated by the + graph partition algorithm. 
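+
+    Example
+    -------
+    A minimal sketch of reading the saved embeddings back. The file name
+    ``embed_dict.hdf5`` is fixed by this function; the node type ``"user"``
+    is only an illustration:
+
+    .. code:: python
+
+        import h5py
+
+        # each node type is stored as a dataset keyed by its name
+        with h5py.File("embed_dict.hdf5", "r") as f:
+            user_emb = f["user"][:]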
+ """ + mapped_embeds = remap_embeddings(embeddings, rank, world_size, + node_id_mapping_file, device) + if rank == 0: + stream_dist_tensors_to_hdf5(mapped_embeds, os.path.join(model_path, "embed_dict.hdf5")) + +def save_embeddings(model_path, embeddings, rank, world_size, + device=th.device('cpu'), node_id_mapping_file=None, + save_embed_format="pytorch"): + """ Save embeddings. + + Parameters + ---------- + model_path : str + The path of the folder where the model is saved. + embeddings : DistTensor + Embeddings to save + rank : int + Rank of the current process in a distributed environment. + world_size : int + World size in a distributed env. + device: torch device + Device used for all_to_allv data exchange. For gloo backend + we store data in CPU, For nccl backend, we need to store + data in GPU. + node_id_mapping_file : str + Path to the file storing node id mapping generated by the + graph partition algorithm. + save_embed_format : str + The format of saved embeddings. + Currently support ["pytorch", "hdf5"]. + """ + os.makedirs(model_path, exist_ok=True) + if save_embed_format == "pytorch": + if rank == 0: + logging.info("Writing GNN embeddings to "\ + "%s in pytorch format.", model_path) + save_pytorch_embeddings(model_path, embeddings, rank, world_size, + device, node_id_mapping_file) + elif save_embed_format == "hdf5": + if rank == 0: + logging.info("Writing GNN embeddings to "\ + "%s in hdf5 format.", \ + os.path.join(model_path, 'embed_dict.hdf5')) + save_hdf5_embeddings(model_path, embeddings, rank, world_size, + device, node_id_mapping_file) + else: + raise ValueError(f"{model_path} is not supported for save_embed_format") + def shuffle_predict(predictions, id_mapping_file, pred_type, rank, world_size, device): """ Shuffle prediction result according to id_mapping @@ -511,7 +679,7 @@ def save_prediction_results(predictions, prediction_path, rank): # make sure the prediction_path permission is changed before other process start to save barrier() - th.save(predictions, os.path.join(prediction_path, "predict-{}.pt".format(rank))) + th.save(predictions, os.path.join(prediction_path, f"predict-{pad_file_index(rank)}.pt")) def load_model(model_path, gnn_model=None, embed_layer=None, decoder=None): """ Load a complete gnn model. @@ -596,7 +764,8 @@ def load_sparse_emb(target_sparse_emb, ntype_emb_path): for i in range(math.ceil(num_files/world_size)): file_idx = i * world_size + rank if file_idx < num_files: - emb = th.load(os.path.join(ntype_emb_path, f'sparse_emb_{file_idx}.pt')) + emb = th.load(os.path.join(ntype_emb_path, + f'sparse_emb_{pad_file_index(file_idx)}.pt')) # Get the target idx range for sparse_emb_{rank}.pt start, end = _get_sparse_emb_range(num_embs, diff --git a/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py b/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py index f43492ed44..cf2f7b622a 100644 --- a/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py @@ -53,7 +53,8 @@ def main(config_args): label_field=config.label_field, decoder_edge_feat=config.decoder_edge_feat) model = gs.create_builtin_edge_gnn_model(infer_data.g, config, train_task=False) - model.restore_model(config.restore_model_path) + model.restore_model(config.restore_model_path, + model_layer_to_load=config.restore_model_layers) # TODO(zhengda) we should use a different way to get rank. 
infer = GSgnnEdgePredictionInferrer(model) infer.setup_device(device=device) @@ -80,16 +81,13 @@ def main(config_args): remove_target_edge_type=config.remove_target_edge_type, construct_feat_ntype=config.construct_feat_ntype, construct_feat_fanout=config.construct_feat_fanout) - # Preparing input layer for training or inference. - # The input layer can pre-compute node features in the preparing step if needed. - # For example pre-compute all BERT embeddings - model.prepare_input_encoder(infer_data) infer.infer(dataloader, save_embed_path=config.save_embed_path, save_prediction_path=config.save_prediction_path, use_mini_batch_infer=config.use_mini_batch_infer, node_id_mapping_file=config.node_id_mapping_file, edge_id_mapping_file=config.edge_id_mapping_file, - return_proba=config.return_proba) + return_proba=config.return_proba, + save_embed_format=config.save_embed_format) def generate_parser(): """ Generate an argument parser diff --git a/python/graphstorm/run/gsgnn_ep/ep_infer_lm.py b/python/graphstorm/run/gsgnn_ep/ep_infer_lm.py index f6b34442b0..7e43ff18cb 100644 --- a/python/graphstorm/run/gsgnn_ep/ep_infer_lm.py +++ b/python/graphstorm/run/gsgnn_ep/ep_infer_lm.py @@ -54,7 +54,8 @@ def main(config_args): label_field=config.label_field, decoder_edge_feat=config.decoder_edge_feat) model = gs.create_builtin_edge_model(infer_data.g, config, train_task=False) - model.restore_model(config.restore_model_path) + model.restore_model(config.restore_model_path, + model_layer_to_load=config.restore_model_layers) infer = GSgnnEdgePredictionInferrer(model) infer.setup_device(device=device) if not config.no_validation: @@ -76,7 +77,8 @@ def main(config_args): use_mini_batch_infer=config.use_mini_batch_infer, node_id_mapping_file=config.node_id_mapping_file, edge_id_mapping_file=config.edge_id_mapping_file, - return_proba=config.return_proba) + return_proba=config.return_proba, + save_embed_format=config.save_embed_format) def generate_parser(): """ Generate an argument parser diff --git a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py index c80cf66d23..e263011287 100644 --- a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py @@ -43,7 +43,8 @@ def main(config_args): node_feat_field=config.node_feat_name, decoder_edge_feat=config.decoder_edge_feat) model = gs.create_builtin_lp_gnn_model(infer_data.g, config, train_task=False) - model.restore_model(config.restore_model_path) + model.restore_model(config.restore_model_path, + model_layer_to_load=config.restore_model_layers) infer = GSgnnLinkPredictionInferrer(model) infer.setup_device(device=device) if not config.no_validation: @@ -69,16 +70,13 @@ def main(config_args): batch_size=config.eval_batch_size, num_negative_edges=config.num_negative_edges_eval, fanout=config.eval_fanout) - # Preparing input layer for training or inference. - # The input layer can pre-compute node features in the preparing step if needed. - # For example pre-compute all BERT embeddings - model.prepare_input_encoder(infer_data) infer.infer(infer_data, dataloader, save_embed_path=config.save_embed_path, edge_mask_for_gnn_embeddings=None if config.no_validation else \ 'train_mask', # if no validation,any edge can be used in message passing. 
use_mini_batch_infer=config.use_mini_batch_infer, - node_id_mapping_file=config.node_id_mapping_file) + node_id_mapping_file=config.node_id_mapping_file, + save_embed_format=config.save_embed_format) def generate_parser(): """ Generate an argument parser diff --git a/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py b/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py index 4150a6a9af..e196d3fd83 100644 --- a/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py +++ b/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py @@ -44,7 +44,8 @@ def main(config_args): node_feat_field=config.node_feat_name, decoder_edge_feat=config.decoder_edge_feat) model = gs.create_builtin_lp_model(infer_data.g, config, train_task=False) - model.restore_model(config.restore_model_path) + model.restore_model(config.restore_model_path, + model_layer_to_load=config.restore_model_layers) infer = GSgnnLinkPredictionInferrer(model) infer.setup_device(device=device) if not config.no_validation: @@ -77,7 +78,8 @@ def main(config_args): save_embed_path=config.save_embed_path, edge_mask_for_gnn_embeddings=None, # LM infer does not use GNN use_mini_batch_infer=config.use_mini_batch_infer, - node_id_mapping_file=config.node_id_mapping_file) + node_id_mapping_file=config.node_id_mapping_file, + save_embed_format=config.save_embed_format) def generate_parser(): """ Generate an argument parser diff --git a/python/graphstorm/run/gsgnn_np/np_infer_gnn.py b/python/graphstorm/run/gsgnn_np/np_infer_gnn.py index d8911ccfd7..e5cc4e629d 100644 --- a/python/graphstorm/run/gsgnn_np/np_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_np/np_infer_gnn.py @@ -15,7 +15,6 @@ Inference script for node classification/regression tasks with GNN """ - import graphstorm as gs from graphstorm.config import get_argument_parser from graphstorm.config import GSConfig @@ -52,7 +51,8 @@ def main(config_args): node_feat_field=config.node_feat_name, label_field=config.label_field) model = gs.create_builtin_node_gnn_model(infer_data.g, config, train_task=False) - model.restore_model(config.restore_model_path) + model.restore_model(config.restore_model_path, + model_layer_to_load=config.restore_model_layers) infer = GSgnnNodePredictionInferrer(model) infer.setup_device(device=device) if not config.no_validation: @@ -76,15 +76,12 @@ def main(config_args): train_task=False, construct_feat_ntype=config.construct_feat_ntype, construct_feat_fanout=config.construct_feat_fanout) - # Preparing input layer for training or inference. - # The input layer can pre-compute node features in the preparing step if needed. 
- # For example pre-compute all BERT embeddings - model.prepare_input_encoder(infer_data) infer.infer(dataloader, save_embed_path=config.save_embed_path, save_prediction_path=config.save_prediction_path, use_mini_batch_infer=config.use_mini_batch_infer, node_id_mapping_file=config.node_id_mapping_file, - return_proba=config.return_proba) + return_proba=config.return_proba, + save_embed_format=config.save_embed_format) def generate_parser(): """ Generate an argument parser diff --git a/python/graphstorm/run/launch.py b/python/graphstorm/run/launch.py index ce089c6bc3..b23d19a263 100644 --- a/python/graphstorm/run/launch.py +++ b/python/graphstorm/run/launch.py @@ -591,9 +591,6 @@ def update_udf_command(udf_command, args): udf_command.append("--part-config") udf_command.append(args.part_config) - udf_command.append("--verbose") - udf_command.append(str(args.verbose)) - return udf_command def get_available_port(ip): @@ -755,9 +752,7 @@ def submit_jobs(args, udf_command): cmd, state_q, ip, args.ssh_port, username=args.ssh_username ) ) - - if args.verbose: - logging.debug(torch_dist_udf_command) + logging.debug(torch_dist_udf_command) # Start a cleanup process dedicated for cleaning up remote training jobs. conn1, conn2 = multiprocessing.Pipe() @@ -804,11 +799,12 @@ def get_argument_parser(): then the ssh command will be like: 'ssh bob@1.2.3.4 CMD' " "instead of 'ssh 1.2.3.4 CMD'", ) + # This argument is deprecated and kept only for backward compatibility. parser.add_argument( "--verbose", type=lambda x: (str(x).lower() in ['true', '1']), default=False, - help="Print more information.", + help="Print more information. This argument is deprecated and is no longer effective.", ) parser.add_argument( "--workspace", @@ -935,9 +931,8 @@ def check_input_arguments(args): args.num_omp_threads = max( cpu_cores_per_trainer, 1 ) - if args.verbose: - logging.debug("The number of OMP threads per trainer is set to %d", - args.num_omp_threads) + logging.debug("The number of OMP threads per trainer is set to %d", + args.num_omp_threads) else: assert args.num_omp_threads > 0, \ "The number of OMP threads per trainer should be larger than 0" @@ -949,9 +944,8 @@ def check_input_arguments(args): args.num_server_threads = max( cpu_cores_per_server, 1 ) - if args.verbose: - logging.debug("The number of OMP threads per server is set to %d", - args.num_server_threads) + logging.debug("The number of OMP threads per server is set to %d", + args.num_server_threads) else: assert args.num_server_threads > 0, \ "The number of OMP threads per server should be larger than 1" diff --git a/python/graphstorm/trainer/ep_trainer.py b/python/graphstorm/trainer/ep_trainer.py index adb1cf9d62..a7de98bac1 100644 --- a/python/graphstorm/trainer/ep_trainer.py +++ b/python/graphstorm/trainer/ep_trainer.py @@ -142,7 +142,7 @@ def fit(self, train_loader, num_epochs, for etype in batch_graph.canonical_etypes} edge_decoder_feats = data.get_edge_feats(input_edges, data.decoder_edge_feat, - batch_graph.device) + device) edge_decoder_feats = {etype: feat.to(th.float32) \ for etype, feat in edge_decoder_feats.items()} else: @@ -283,6 +283,19 @@ def eval(self, model, val_loader, test_loader, use_mini_batch_infer, total_steps """ test_start = time.time() sys_tracker.check('start prediction') + + metric = set(self.evaluator.metric) + need_proba = metric.intersection({'roc_auc', 'per_class_roc_auc', 'precision_recall'}) + need_label_pred = metric.intersection({'accuracy', 'f1_score', 'per_class_f1_score'}) + assert len(need_proba) == 0 or len(need_label_pred) == 0, \ + f"{need_proba} requires return_proba==True, " \
+ f"but {need_label_pred} requires return_proba==False." + if len(need_proba) > 0 and return_proba is False: + return_proba = True + logging.warning("%s requires return_proba==True. " + "Set return_proba to True.", need_proba) + + model.eval() if use_mini_batch_infer: val_pred, val_label = edge_mini_batch_gnn_predict(model, val_loader, return_proba, diff --git a/python/graphstorm/trainer/np_trainer.py b/python/graphstorm/trainer/np_trainer.py index 46323d634c..a7dccf8532 100644 --- a/python/graphstorm/trainer/np_trainer.py +++ b/python/graphstorm/trainer/np_trainer.py @@ -261,6 +261,18 @@ def eval(self, model, val_loader, test_loader, use_mini_batch_infer, total_steps """ teval = time.time() sys_tracker.check('before prediction') + + metric = set(self.evaluator.metric) + need_proba = metric.intersection({'roc_auc', 'per_class_roc_auc', 'precision_recall'}) + need_label_pred = metric.intersection({'accuracy', 'f1_score', 'per_class_f1_score'}) + assert len(need_proba) == 0 or len(need_label_pred) == 0, \ + f"{need_proba} requires return_proba==True, " \ + f"but {need_label_pred} requires return_proba==False." + if len(need_proba) > 0 and return_proba is False: + return_proba = True + logging.warning("%s requires return_proba==True. " + "Set return_proba to True.", need_proba) + if use_mini_batch_infer: val_pred, _, val_label = node_mini_batch_gnn_predict(model, val_loader, return_proba, return_label=True) diff --git a/python/graphstorm/utils.py b/python/graphstorm/utils.py index c5c13a0145..4d0f625480 100644 --- a/python/graphstorm/utils.py +++ b/python/graphstorm/utils.py @@ -30,7 +30,16 @@ TORCH_MAJOR_VER = int(th.__version__.split('.', maxsplit=1)[0]) def setup_device(local_rank): - """Setup computation device + r"""Set up the computation device. + + Parameters + ---------- + local_rank: int + Rank of the current process in a distributed environment. + + Returns + ------- + str: device where the model runs. """ if th.cuda.is_available(): assert local_rank < th.cuda.device_count(), \ diff --git a/tests/end2end-tests/data_process/compare_graphs.py b/tests/end2end-tests/data_process/compare_graphs.py index d44025ad4a..5d02a9d857 100644 --- a/tests/end2end-tests/data_process/compare_graphs.py +++ b/tests/end2end-tests/data_process/compare_graphs.py @@ -19,6 +19,8 @@ import dgl import numpy as np +from numpy.testing import assert_almost_equal + argparser = argparse.ArgumentParser("Compare graphs") argparser.add_argument("--graph-path1", type=str, required=True, help="The path of the constructed graph.") @@ -35,11 +37,14 @@ for name in g1.nodes[ntype].data: # We should skip '*_mask' because data split is split randomly. if 'mask' not in name: - assert np.all(g1.nodes[ntype].data[name].numpy() == g2.nodes[ntype].data[name].numpy()) + assert_almost_equal(g1.nodes[ntype].data[name].numpy(), + g2.nodes[ntype].data[name].numpy()) + for etype in g1.canonical_etypes: assert g1.number_of_edges(etype) == g2.number_of_edges(etype) for name in g1.edges[etype].data: # We should skip '*_mask' because data split is split randomly.
if 'mask' not in name: - assert np.all(g1.edges[etype].data[name].numpy() == g2.edges[etype].data[name].numpy()) + assert_almost_equal(g1.edges[etype].data[name].numpy(), + g2.edges[etype].data[name].numpy()) diff --git a/tests/end2end-tests/data_process/data_gen.py b/tests/end2end-tests/data_process/data_gen.py index 4c8525e639..abab89ea3d 100644 --- a/tests/end2end-tests/data_process/data_gen.py +++ b/tests/end2end-tests/data_process/data_gen.py @@ -41,6 +41,7 @@ def gen_rand_nid(max_nid, num_nodes): return gen_rand_nid(max_nid, num_nodes) return node_ids +np.random.seed(1) node_id1 = gen_rand_nid(1000000000, 10000) node_text = np.array([str(nid) for nid in node_id1]) node_data1 = { diff --git a/tests/end2end-tests/graphstorm-ec/mgpu_test.sh b/tests/end2end-tests/graphstorm-ec/mgpu_test.sh index e702620cff..71928c453a 100644 --- a/tests/end2end-tests/graphstorm-ec/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-ec/mgpu_test.sh @@ -173,3 +173,10 @@ echo "**************dataset: Generated multilabel MovieLens EC, RGCN layer: 1, n python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --exclude-training-targets True --multilabel true --num-classes 6 --node-feat-name movie:title user:feat --use-mini-batch-infer false --num-epochs 1 --backend nccl error_and_exit $? + +echo "**************dataset: Generated multilabel MovieLens EC, RGCN layer: 1, node feat: generated feature, inference: minibatch, exclude-training-targets: True, decoder edge feat: label" +python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --exclude-training-targets True --multilabel true --num-classes 6 --node-feat-name movie:title user:feat --use-mini-batch-infer true --topk-model-to-save 1 --save-embed-path /data/gsgnn_ec/emb/ --save-model-path /data/gsgnn_ec/ --save-model-frequency 1000 --decoder-edge-feat user,rating,movie:rate --fanout 'user/rating/movie:4@movie/rating-rev/user:5,user/rating/movie:2@movie/rating-rev/user:2' --num-layers 2 --decoder-type MLPEFeatEdgeDecoder + +error_and_exit $? +rm -fr /data/gsgnn_ec/* + diff --git a/tests/end2end-tests/graphstorm-ec/test.sh b/tests/end2end-tests/graphstorm-ec/test.sh index a915ea0515..2413940d16 100644 --- a/tests/end2end-tests/graphstorm-ec/test.sh +++ b/tests/end2end-tests/graphstorm-ec/test.sh @@ -79,8 +79,8 @@ python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_s error_and_exit $? 
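+
+# Note: with the metric check added to the trainers in this patch,
+# probability-based metrics (e.g. precision_recall, roc_auc) can no longer
+# be mixed with label-based metrics (e.g. accuracy, f1_score) in one run;
+# for example,
+#   --eval-metric precision_recall accuracy
+# would now fail the assertion, so the test below uses a single metric.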
# TODO(zhengda) Failure found during evaluation of the auc metric returning -1 multiclass format is not supported -echo "**************dataset: Test edge classification, RGCN layer: 1, node feat: fixed HF BERT, BERT nodes: movie, inference: mini-batch, eval_metric: precision_recall accuracy" -python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_ec_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --part-config /data/movielen_100k_ec_1p_4t/movie-lens-100k.json --eval-metric precision_recall accuracy --num-epochs 1 +echo "**************dataset: Test edge classification, RGCN layer: 1, node feat: fixed HF BERT, BERT nodes: movie, inference: mini-batch, eval_metric: precision_recall" +python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_ec_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --part-config /data/movielen_100k_ec_1p_4t/movie-lens-100k.json --eval-metric precision_recall --num-epochs 1 error_and_exit $? diff --git a/tests/unit-tests/gconstruct/test_transform.py b/tests/unit-tests/gconstruct/test_transform.py index 9fb246b0f6..d11d7ca91e 100644 --- a/tests/unit-tests/gconstruct/test_transform.py +++ b/tests/unit-tests/gconstruct/test_transform.py @@ -112,7 +112,10 @@ def test_fp_transform(input_dtype): assert_equal(min_val[i], -5.) # Test collect info - transform = NumericalMinMaxTransform("test", "test") + transform_conf = { + "name": "max_min_norm" + } + transform = NumericalMinMaxTransform("test", "test", transform_conf=transform_conf) info = [(np.array([1.]), np.array([-1.])), (np.array([2.]), np.array([-0.5])), (np.array([0.5]), np.array([-0.1]))] @@ -121,6 +124,10 @@ def test_fp_transform(input_dtype): assert len(transform._min_val) == 1 assert_equal(transform._max_val[0], 2.) assert_equal(transform._min_val[0], -1.) + assert 'max_val' in transform_conf + assert 'min_val' in transform_conf + assert_equal(np.array(transform_conf['max_val']), 2.) + assert_equal(np.array(transform_conf['min_val']), -1.) info = [(np.array([1., 2., 3.]), np.array([-1., -2., 0.5])), (np.array([2., 1., 3.]), np.array([-0.5, -3., 0.1])), @@ -130,6 +137,84 @@ def test_fp_transform(input_dtype): assert len(transform._min_val) == 3 assert_equal(transform._max_val[0], 2.) assert_equal(transform._min_val[0], -1.) + assert 'max_val' in transform_conf + assert 'min_val' in transform_conf + assert_equal(np.array(transform_conf['max_val']), + np.array([2.,3.,3.])) + assert_equal(np.array(transform_conf['min_val']), + np.array([-1.,-3.,0.1])) + + transform_conf = { + "name": "max_min_norm", + "max_val": [1.,1.,1.], + "min_val": [-1.,-1.,-1.]
+ } + transform = NumericalMinMaxTransform("test", "test", + max_val=transform_conf['max_val'], + min_val=transform_conf['min_val'], + transform_conf=transform_conf) + feats = 2 * np.random.randn(10, 3).astype(input_dtype) + feats[0][0] = 2 + feats[0][1] = -2 + info = transform.pre_process(feats) + max_val = np.array(transform_conf['max_val']) + min_val = np.array(transform_conf["min_val"]) + assert_equal(info["test"][0], max_val) + assert_equal(info["test"][1], min_val) + transform.update_info([info["test"]]) + assert_equal(np.array(transform_conf['max_val']), + np.array([1.,1.,1.])) + assert_equal(np.array(transform_conf['min_val']), + np.array([-1.,-1.,-1.])) + result = transform(feats) + true_result = (feats - min_val) / (max_val - min_val) + true_result[true_result > 1] = 1 + true_result[true_result < 0] = 0 + assert_almost_equal(result["test"].astype(input_dtype), true_result) + + transform_conf = { + "name": "max_min_norm", + "min_val": [-1.,-1.,-1.] + } + transform = NumericalMinMaxTransform("test", "test", + min_val=transform_conf['min_val'], + transform_conf=transform_conf) + info = transform.pre_process(feats) + max_val = info["test"][0] + min_val = np.array(transform_conf['min_val']) + assert_equal(info["test"][0], max_val) + transform.update_info([info["test"]]) + assert_equal(np.array(transform_conf['max_val']), + max_val) + assert_equal(np.array(transform_conf['min_val']), + np.array([-1.,-1.,-1.])) + result = transform(feats) + true_result = (feats - min_val) / (max_val - min_val) + true_result[true_result > 1] = 1 + true_result[true_result < 0] = 0 + assert_almost_equal(result["test"].astype(input_dtype), true_result) + + transform_conf = { + "name": "max_min_norm", + "max_val": [1.,1.,1.] + } + transform = NumericalMinMaxTransform("test", "test", + max_val=transform_conf['max_val'], + transform_conf=transform_conf) + info = transform.pre_process(feats) + max_val = np.array(transform_conf['max_val']) + min_val = info["test"][1] + assert_equal(info["test"][0], max_val) + transform.update_info([info["test"]]) + assert_equal(np.array(transform_conf['max_val']), + np.array([1.,1.,1.])) + assert_equal(np.array(transform_conf['min_val']), + min_val) + result = transform(feats) + true_result = (feats - min_val) / (max_val - min_val) + true_result[true_result > 1] = 1 + true_result[true_result < 0] = 0 + assert_almost_equal(result["test"].astype(input_dtype), true_result) @pytest.mark.parametrize("input_dtype", [np.cfloat, np.float32]) @pytest.mark.parametrize("out_dtype", [None, np.float16]) diff --git a/tests/unit-tests/test_dataloading.py b/tests/unit-tests/test_dataloading.py index 5c31efe7c5..1f4d911c8d 100644 --- a/tests/unit-tests/test_dataloading.py +++ b/tests/unit-tests/test_dataloading.py @@ -178,8 +178,10 @@ def test_GSgnnEdgeData(): for etype in tr_etypes: assert th.all(tr_data1.test_idxs[etype] == get_nonzero(dist_graph.edges[etype[1]].data['test_mask'])) assert len(ev_data.test_idxs) == len(va_etypes) + assert len(ev_data.infer_idxs) == len(ev_data.test_idxs) for etype in va_etypes: assert th.all(ev_data.test_idxs[etype] == get_nonzero(dist_graph.edges[etype[1]].data['test_mask'])) + assert th.all(ev_data.infer_idxs[etype] == get_nonzero(dist_graph.edges[etype[1]].data['test_mask'])) # pass train etypes as None assert len(tr_data2.train_idxs) == len(dist_graph.canonical_etypes) @@ -201,8 +203,10 @@ def test_GSgnnEdgeData(): # pass eval etypes as None assert len(ev_data2.test_idxs) == 2 + assert len(ev_data2.infer_idxs) == 2 for etype in dist_graph.canonical_etypes: 
assert th.all(ev_data2.test_idxs[etype] == get_nonzero(dist_graph.edges[etype[1]].data['test_mask'])) + assert th.all(ev_data2.infer_idxs[etype] == get_nonzero(dist_graph.edges[etype[1]].data['test_mask'])) labels = tr_data.get_labels({('n0', 'r1', 'n1'): [0, 1]}) assert len(labels.keys()) == 1 @@ -273,8 +277,10 @@ def test_GSgnnNodeData(): for ntype in tr_ntypes: assert th.all(tr_data1.test_idxs[ntype] == get_nonzero(dist_graph.nodes[ntype].data['test_mask'])) assert len(ev_data.test_idxs) == len(va_ntypes) + assert len(ev_data.infer_idxs) == len(va_ntypes) for ntype in va_ntypes: assert th.all(ev_data.test_idxs[ntype] == get_nonzero(dist_graph.nodes[ntype].data['test_mask'])) + assert th.all(ev_data.infer_idxs[ntype] == get_nonzero(dist_graph.nodes[ntype].data['test_mask'])) labels = tr_data.get_labels({'n1': [0, 1]}) assert len(labels.keys()) == 1 diff --git a/tests/unit-tests/test_model_save_load.py b/tests/unit-tests/test_model_save_load.py index 543e180e11..7887c2c304 100644 --- a/tests/unit-tests/test_model_save_load.py +++ b/tests/unit-tests/test_model_save_load.py @@ -28,6 +28,7 @@ from graphstorm.model.utils import save_sparse_embeds from graphstorm.model.utils import load_sparse_embeds from graphstorm.model.utils import _get_sparse_emb_range +from graphstorm.model.utils import pad_file_index from graphstorm import get_feat_size from data_utils import generate_dummy_dist_graph @@ -100,7 +101,7 @@ def check_saved_sparse_emb(mock_get_world_size, mock_get_rank): for i in range(world_size): saved_embs.append(th.load( os.path.join(os.path.join(model_path, ntype), - f'sparse_emb_{i}.pt'))) + f'sparse_emb_{pad_file_index(i)}.pt'))) saved_embs = th.cat(saved_embs, dim=0) assert_equal(saved_embs.numpy(), sparse_embs[ntype].numpy()) check_saved_sparse_emb() diff --git a/tests/unit-tests/test_utils.py b/tests/unit-tests/test_utils.py index f3ad3ec5c8..d8ad6f65be 100644 --- a/tests/unit-tests/test_utils.py +++ b/tests/unit-tests/test_utils.py @@ -17,14 +17,18 @@ import tempfile import pytest import multiprocessing as mp +import h5py import torch as th import numpy as np +import dgl from numpy.testing import assert_equal +from dgl.distributed import DistTensor from graphstorm.model.utils import save_embeddings, LazyDistTensor, remove_saved_models, TopKList from graphstorm.model.utils import _get_data_range -from graphstorm.model.utils import _exchange_node_id_mapping +from graphstorm.model.utils import _exchange_node_id_mapping, distribute_nid_map from graphstorm.model.utils import shuffle_predict +from graphstorm.model.utils import pad_file_index from graphstorm.gconstruct.utils import save_maps from graphstorm import get_feat_size @@ -32,6 +36,8 @@ from graphstorm.eval.utils import gen_mrr_score from graphstorm.utils import setup_device +from graphstorm.gconstruct.file_io import stream_dist_tensors_to_hdf5 + def gen_embedding_with_nid_mapping(num_embs): emb = th.rand((num_embs, 12)) ori_nid_mapping = th.randperm(num_embs) @@ -123,7 +129,7 @@ def run_dist_exchange_node_id_mapping(worker_rank, world_size, backend, assert_equal(target_nid_mapping.numpy(), nid_mapping.cpu().numpy()) @pytest.mark.parametrize("num_embs", [100, 101]) -@pytest.mark.parametrize("backend", ["gloo"]) +@pytest.mark.parametrize("backend", ["gloo", "nccl"]) def test_exchange_node_id_mapping(num_embs, backend): node_id_mapping = th.randperm(num_embs) start, end = _get_data_range(0, 4, num_embs) @@ -157,6 +163,108 @@ def test_exchange_node_id_mapping(num_embs, backend): assert p2.exitcode == 0 assert p3.exitcode == 0 +def 
run_distribute_nid_map(embeddings, local_rank, world_size, + node_id_mapping_file, backend, target_nid_mapping): + dist_init_method = 'tcp://{master_ip}:{master_port}'.format( + master_ip='127.0.0.1', master_port='12345') + th.distributed.init_process_group(backend=backend, + init_method=dist_init_method, + world_size=world_size, + rank=local_rank) + device = setup_device(local_rank) + nid_mapping = distribute_nid_map(embeddings, local_rank, world_size, + node_id_mapping_file, device) + + if isinstance(embeddings, (dgl.distributed.DistTensor, LazyDistTensor)): + assert_equal(target_nid_mapping[local_rank].numpy(), nid_mapping.cpu().numpy()) + elif isinstance(embeddings, dict): + for name in embeddings.keys(): + assert_equal(target_nid_mapping[name][local_rank].numpy(), \ + nid_mapping[name].cpu().numpy()) + +@pytest.mark.parametrize("backend", ["gloo", "nccl"]) +def test_distribute_nid_map(backend): + # need to force a reset of the fork context + # because dist tensors are the input for multiple processes + with tempfile.TemporaryDirectory() as tmpdirname: + # get the test dummy distributed graph + g, _ = generate_dummy_dist_graph(tmpdirname, size="tiny") + dummy_dist_embeds = {} + ori_nid_maps = {} + target_nid_maps = {} + for ntype in g.ntypes: + dummy_dist_embeds[ntype] = DistTensor((g.number_of_nodes(ntype), 5), + dtype=th.float32, name=f'ntype-{ntype}', + part_policy=g.get_node_partition_policy(ntype)) + ori_nid_maps[ntype] = th.randperm(g.number_of_nodes(ntype)) + + target_nid_maps[ntype] = [] + _, sorted_nid_map = th.sort(ori_nid_maps[ntype]) + for i in range(4): + start, end = _get_data_range(i, 4, g.number_of_nodes(ntype)) + target_nid_maps[ntype].append(sorted_nid_map[start:end].clone()) + + nid_map_dict_path = os.path.join(tmpdirname, "nid_map_dict.pt") + nid_map_tensor_path = os.path.join(tmpdirname, "nid_map_tensor.pt") + th.save(ori_nid_maps, nid_map_dict_path) + dummy_ntype = g.ntypes[0] + th.save(ori_nid_maps[dummy_ntype], nid_map_tensor_path) + + # when dummy_dist_embeds is a dict + ctx = mp.get_context('spawn') + p0 = ctx.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds, 0, 4, nid_map_dict_path, backend, \ + target_nid_maps)) + p1 = ctx.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds, 1, 4, nid_map_dict_path, backend, \ + target_nid_maps)) + p2 = ctx.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds, 2, 4, nid_map_dict_path, backend, \ + target_nid_maps)) + p3 = ctx.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds, 3, 4, nid_map_dict_path, backend, \ + target_nid_maps)) + p0.start() + p1.start() + p2.start() + p3.start() + p0.join() + p1.join() + p2.join() + p3.join() + assert p0.exitcode == 0 + assert p1.exitcode == 0 + assert p2.exitcode == 0 + assert p3.exitcode == 0 + + # when dummy_dist_embeds is a dist tensor + ctx2 = mp.get_context('spawn') + p4 = ctx2.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds[dummy_ntype], 0, 4, nid_map_tensor_path, \ + backend, target_nid_maps[dummy_ntype])) + p5 = ctx2.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds[dummy_ntype], 1, 4, nid_map_tensor_path, \ + backend, target_nid_maps[dummy_ntype])) + p6 = ctx2.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds[dummy_ntype], 2, 4, nid_map_tensor_path, \ + backend, target_nid_maps[dummy_ntype])) + p7 = ctx2.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds[dummy_ntype], 3, 4, nid_map_tensor_path, \ + backend, target_nid_maps[dummy_ntype])) + + p4.start() + p5.start() + 
p6.start() + p7.start() + p4.join() + p5.join() + p6.join() + p7.join() + assert p4.exitcode == 0 + assert p5.exitcode == 0 + assert p6.exitcode == 0 + assert p7.exitcode == 0 + def run_dist_save_embeddings(model_path, emb, worker_rank, world_size, node_id_mapping_file, backend): dist_init_method = 'tcp://{master_ip}:{master_port}'.format( @@ -266,7 +374,7 @@ def test_shuffle_predict(num_embs, backend): # TODO: Only test gloo now # Will add test for nccl once we enable nccl @pytest.mark.parametrize("num_embs", [16, 17]) -@pytest.mark.parametrize("backend", ["gloo"]) +@pytest.mark.parametrize("backend", ["gloo", "nccl"]) def test_save_embeddings_with_id_mapping(num_embs, backend): import tempfile @@ -289,8 +397,8 @@ def test_save_embeddings_with_id_mapping(num_embs, backend): assert p1.exitcode == 0 # Load saved embeddings - emb0 = th.load(os.path.join(tmpdirname, 'emb.part0.bin'), weights_only=True) - emb1 = th.load(os.path.join(tmpdirname, 'emb.part1.bin'), weights_only=True) + emb0 = th.load(os.path.join(tmpdirname, f'emb.part{pad_file_index(0)}.bin'), weights_only=True) + emb1 = th.load(os.path.join(tmpdirname, f'emb.part{pad_file_index(1)}.bin'), weights_only=True) saved_emb = th.cat([emb0, emb1], dim=0) assert len(saved_emb) == len(emb) assert_equal(emb[nid_mapping].numpy(), saved_emb.numpy()) @@ -329,20 +437,20 @@ def test_save_embeddings_with_id_mapping(num_embs, backend): assert p1.exitcode == 0 # Load saved embeddings - emb0 = th.load(os.path.join(tmpdirname, 'n0_emb.part0.bin'), weights_only=True) - emb1 = th.load(os.path.join(tmpdirname, 'n0_emb.part1.bin'), weights_only=True) + emb0 = th.load(os.path.join(tmpdirname, f'n0_emb.part{pad_file_index(0)}.bin'), weights_only=True) + emb1 = th.load(os.path.join(tmpdirname, f'n0_emb.part{pad_file_index(1)}.bin'), weights_only=True) saved_emb = th.cat([emb0, emb1], dim=0) assert len(saved_emb) == len(embs['n0']) assert_equal(embs['n0'][nid_mappings['n0']].numpy(), saved_emb.numpy()) - emb0 = th.load(os.path.join(tmpdirname, 'n1_emb.part0.bin'), weights_only=True) - emb1 = th.load(os.path.join(tmpdirname, 'n1_emb.part1.bin'), weights_only=True) + emb0 = th.load(os.path.join(tmpdirname, f'n1_emb.part{pad_file_index(0)}.bin'), weights_only=True) + emb1 = th.load(os.path.join(tmpdirname, f'n1_emb.part{pad_file_index(1)}.bin'), weights_only=True) saved_emb = th.cat([emb0, emb1], dim=0) assert len(saved_emb) == len(embs['n1']) assert_equal(embs['n1'][nid_mappings['n1']].numpy(), saved_emb.numpy()) - emb0 = th.load(os.path.join(tmpdirname, 'n2_emb.part0.bin'), weights_only=True) - emb1 = th.load(os.path.join(tmpdirname, 'n2_emb.part1.bin'), weights_only=True) + emb0 = th.load(os.path.join(tmpdirname, f'n2_emb.part{pad_file_index(0)}.bin'), weights_only=True) + emb1 = th.load(os.path.join(tmpdirname, f'n2_emb.part{pad_file_index(1)}.bin'), weights_only=True) saved_emb = th.cat([emb0, emb1], dim=0) assert len(saved_emb) == len(embs['n2']) assert_equal(embs['n2'][nid_mappings['n2']].numpy(), saved_emb.numpy()) @@ -359,11 +467,11 @@ def test_save_embeddings(): type0_random_emb, type1_random_emb = helper_save_embedding(tmpdirname) # Only work with torch 1.13+ - feats_type0 = [th.load(os.path.join(tmpdirname, "type0_emb.part{}.bin".format(i)), + feats_type0 = [th.load(os.path.join(tmpdirname, f"type0_emb.part{pad_file_index(i)}.bin"), weights_only=True) for i in range(4)] feats_type0 = th.cat(feats_type0, dim=0) # Only work with torch 1.13+ - feats_type1 = [th.load(os.path.join(tmpdirname, "type1_emb.part{}.bin".format(i)), + feats_type1 = 
[th.load(os.path.join(tmpdirname, f"type1_emb.part{pad_file_index(i)}.bin"), weights_only=True) for i in range(4)] feats_type1 = th.cat(feats_type1, dim=0) @@ -463,7 +571,44 @@ def test_gen_mrr_score(): assert th.isclose(metrics['mrr'], metrics_opti['mrr']) # Default tolerance: 1e-08 +def test_stream_dist_tensors_to_hdf5(): + with tempfile.TemporaryDirectory() as tmpdirname: + # get the test dummy distributed graph + # medium size has 1,000,000 nodes, which is enough (>chunk_size) + g, _ = generate_dummy_dist_graph(tmpdirname, size="medium") + + dummy_dist_embeds = {} + for ntype in g.ntypes: + dummy_dist_embeds[ntype] = DistTensor((g.number_of_nodes(ntype), 5), + dtype=th.float32, name=f'ntype-{ntype}', + part_policy=g.get_node_partition_policy(ntype)) + + # chunk size needs to be smaller than num of nodes + chunk_size = g.number_of_nodes(g.ntypes[0]) // 4 + stream_dist_tensors_to_hdf5(dummy_dist_embeds, os.path.join(tmpdirname, "embed_dict.hdf5"), \ + chunk_size=chunk_size) + + read_f = h5py.File(os.path.join(tmpdirname, "embed_dict.hdf5"), "r") + for ntype in g.ntypes: + assert g.number_of_nodes(ntype) == len(read_f[ntype]) + assert_equal(dummy_dist_embeds[ntype][0:len(dummy_dist_embeds[ntype])].numpy(), \ + read_f[ntype][0:]) + +def test_pad_file_index(): + assert pad_file_index(1) == "00001" + assert pad_file_index(111) == "00111" + assert pad_file_index(111, 4) == "0111" + fail = False + try: + pad_file_index(111, 0) + except: + fail = True + assert fail + if __name__ == '__main__': + test_distribute_nid_map(backend='gloo') + test_distribute_nid_map(backend='nccl') + test_shuffle_predict(num_embs=16, backend='gloo') test_shuffle_predict(num_embs=17, backend='nccl') @@ -478,3 +623,5 @@ def test_gen_mrr_score(): test_remove_saved_models() test_topklist() test_gen_mrr_score() + + test_stream_dist_tensors_to_hdf5()