diff --git a/.github/workflow_scripts/lint_check.sh b/.github/workflow_scripts/lint_check.sh index 3eecaf93c3..f49c156926 100644 --- a/.github/workflow_scripts/lint_check.sh +++ b/.github/workflow_scripts/lint_check.sh @@ -4,6 +4,8 @@ cd ../../ set -ex python3 -m pip install --upgrade prospector pip +pip3 uninstall -y astroid +yes | pip3 install astroid==2.15.7 FORCE_CUDA=1 python3 -m pip install -e '.[test]' --no-build-isolation pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/data/*.py pylint --rcfile=./tests/lint/pylintrc ./python/graphstorm/dataloading/ diff --git a/docker/build_docker_wholegraph.sh b/docker/build_docker_wholegraph.sh new file mode 100644 index 0000000000..7caa3bef0a --- /dev/null +++ b/docker/build_docker_wholegraph.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# process argument 1: graphstorm home folder +if [ -z "$1" ]; then + echo "Please provide the graphstorm home folder that the GraphStorm code is cloned to." + echo "For example, ./build_docker_wholegraph.sh /graph-storm/" + exit 1 +else + GSF_HOME="$1" +fi + +# process argument 2: docker image name, default is graphstorm +if [ -z "$2" ]; then + IMAGE_NAME="graphstorm-wholegraph" +else + IMAGE_NAME="$2" +fi + +# process argument 3: image's tag name, default is local +if [ -z "$3" ]; then + TAG="local" +else + TAG="$3" +fi + +# Copy scripts and tools code to the docker folder +mkdir -p $GSF_HOME"/docker/code" +cp -r $GSF_HOME"/python" $GSF_HOME"/docker/code/python" +cp -r $GSF_HOME"/inference_scripts" $GSF_HOME"/docker/code/inference_scripts" +cp -r $GSF_HOME"/tools" $GSF_HOME"/docker/code/tools" +cp -r $GSF_HOME"/training_scripts" $GSF_HOME"/docker/code/training_scripts" + +# Build OSS docker for EC2 instances that can pull ECR docker images +DOCKER_FULLNAME="${IMAGE_NAME}:${TAG}" + +echo "Build a local docker image ${DOCKER_FULLNAME}" +docker build --no-cache -f $GSF_HOME"/docker/wholegraph/Dockerfile" .
-t $DOCKER_FULLNAME + +# remove the temporary code folder +rm -rf $GSF_HOME"/docker/code" diff --git a/docker/wholegraph/Dockerfile b/docker/wholegraph/Dockerfile new file mode 100644 index 0000000000..43b2278426 --- /dev/null +++ b/docker/wholegraph/Dockerfile @@ -0,0 +1,60 @@ +FROM nvcr.io/nvidia/dgl:23.07-py3 + +################################################# +## Install EFA installer +ARG EFA_INSTALLER_VERSION=latest +RUN cd $HOME \ + && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && tar -xf $HOME/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz \ + && cd aws-efa-installer \ + && apt-get update \ + && apt-get install -y libhwloc-dev \ + && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ + && rm -rf /var/lib/apt/lists/* + +################################################### +## Install AWS-OFI-NCCL plugin +ARG AWS_OFI_NCCL_VERSION=v1.7.1-aws +RUN git clone https://github.com/aws/aws-ofi-nccl.git /opt/aws-ofi-nccl \ + && cd /opt/aws-ofi-nccl \ + && git checkout ${AWS_OFI_NCCL_VERSION} \ + && ./autogen.sh \ + && ./configure --prefix=/opt/aws-ofi-nccl/ \ + --with-libfabric=/opt/amazon/efa/ \ + --with-cuda=/usr/local/cuda \ + && make && make install + +ENV PATH "/opt/amazon/efa/bin:$PATH" + +# Install WholeGraph +COPY wholegraph/install_wholegraph.sh install_wholegraph.sh +RUN bash install_wholegraph.sh + +# Install GraphStorm +RUN pip install --no-cache-dir boto3 'h5py>=2.10.0' scipy tqdm 'pyarrow>=3' 'transformers==4.28.1' pandas pylint scikit-learn ogb psutil +RUN git clone https://github.com/awslabs/graphstorm + +# Increase nofile limit +RUN echo "root soft nofile 1048576" >> /etc/security/limits.conf \ + && echo "root hard nofile 1048576" >> /etc/security/limits.conf + +# Make EFA NCCL plugin the default plugin +RUN sed -i '/nccl_rdma_sharp_plugin/d' /etc/ld.so.conf.d/hpcx.conf \ + && echo "/opt/aws-ofi-nccl/lib" >> /etc/ld.so.conf.d/hpcx.conf \ + && ldconfig + +# Set up SSH +RUN apt-get update && apt-get install -y openssh-client openssh-server && rm -rf /var/lib/apt/lists/* +ENV SSH_PORT=2222 +RUN cat /etc/ssh/sshd_config > /tmp/sshd_config && \ + sed "0,/^#Port 22/s//Port ${SSH_PORT}/" /tmp/sshd_config > /etc/ssh/sshd_config +ENV HOME=/root +ENV SSHDIR $HOME/.ssh +RUN mkdir -p ${SSHDIR} +RUN ssh-keygen -t rsa -f ${SSHDIR}/id_rsa -N '' +RUN cp ${SSHDIR}/id_rsa.pub ${SSHDIR}/authorized_keys +RUN touch /root/.ssh/config;echo -e "Host *\n StrictHostKeyChecking no\n UserKnownHostsFile=/dev/null\n Port ${SSH_PORT}" > /root/.ssh/config +EXPOSE 2222 +RUN mkdir /run/sshd + +CMD ["/usr/sbin/sshd", "-D"] diff --git a/docker/wholegraph/install_wholegraph.sh b/docker/wholegraph/install_wholegraph.sh new file mode 100644 index 0000000000..e0d5b4fee2 --- /dev/null +++ b/docker/wholegraph/install_wholegraph.sh @@ -0,0 +1,25 @@ +#!/bin/bash +git clone https://github.com/fmtlib/fmt.git /opt/fmt +cd /opt/fmt +git checkout 9.1.0 +mkdir build && cd build +cmake -DCMAKE_POSITION_INDEPENDENT_CODE=TRUE .. +make +make install + +git clone https://github.com/gabime/spdlog.git /opt/spdlog +cd /opt/spdlog && mkdir build && cd build +cmake .. 
&& make -j +cp libspdlog.a /usr/lib/libspdlog.a +export PYTHON=/usr/bin/python + +cd /opt/rapids/ +git clone https://github.com/rapidsai/wholegraph.git -b branch-23.08 +cd /opt/rapids/wholegraph/ +pip install scikit-build +export WHOLEGRAPH_CMAKE_CUDA_ARCHITECTURES="70-real;80-real;90" +# fix a bug in CMakeLists.txt when building pylibwholegraph +old="import sysconfig; print(sysconfig.get_config_var('BINLIBDEST'))" +string="import sysconfig; print(\"%s/%s\" % (sysconfig.get_config_var(\"LIBDIR\"), sysconfig.get_config_var(\"INSTSONAME\")))" +sed -i "s|$old|$string|" /opt/rapids/wholegraph/python/pylibwholegraph/CMakeLists.txt +bash build.sh libwholegraph pylibwholegraph -v diff --git a/docs/source/_templates/dataloadertemplate.rst b/docs/source/_templates/dataloadertemplate.rst new file mode 100644 index 0000000000..f02d586215 --- /dev/null +++ b/docs/source/_templates/dataloadertemplate.rst @@ -0,0 +1,10 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :show-inheritance: + :special-members: __iter__, __next__ \ No newline at end of file diff --git a/docs/source/_templates/datasettemplate.rst b/docs/source/_templates/datasettemplate.rst new file mode 100644 index 0000000000..b503bdbb1e --- /dev/null +++ b/docs/source/_templates/datasettemplate.rst @@ -0,0 +1,10 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :show-inheritance: + :members: prepare_data, get_node_feats, get_edge_feats, get_labels diff --git a/docs/source/_templates/classtemplate.rst b/docs/source/_templates/evaltemplate.rst similarity index 100% rename from docs/source/_templates/classtemplate.rst rename to docs/source/_templates/evaltemplate.rst diff --git a/docs/source/_templates/inferencetemplate.rst b/docs/source/_templates/inferencetemplate.rst new file mode 100644 index 0000000000..c5a289df3e --- /dev/null +++ b/docs/source/_templates/inferencetemplate.rst @@ -0,0 +1,11 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :show-inheritance: + :members: setup_device, setup_evaluator, evaluator, device, infer + diff --git a/docs/source/_templates/modeltemplate.rst b/docs/source/_templates/modeltemplate.rst new file mode 100644 index 0000000000..06a2366ca1 --- /dev/null +++ b/docs/source/_templates/modeltemplate.rst @@ -0,0 +1,10 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :show-inheritance: + :members: forward, save_model, restore_model, predict, create_optimizer \ No newline at end of file diff --git a/docs/source/_templates/trainertemplate.rst b/docs/source/_templates/trainertemplate.rst new file mode 100644 index 0000000000..e4023be1c7 --- /dev/null +++ b/docs/source/_templates/trainertemplate.rst @@ -0,0 +1,12 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :show-inheritance: + :members: setup_device, setup_evaluator, save_model, remove_saved_model, save_topk_models, + get_best_model_path, restore_model, evaluator, optimizer, device, fit, eval + diff --git a/docs/source/api/graphstorm.customized.rst b/docs/source/api/graphstorm.customized.rst deleted file mode 100644 index 9172b469f0..0000000000 --- a/docs/source/api/graphstorm.customized.rst +++ /dev/null @@ -1,62 +0,0 @@ -..
_apicustomized: - -customized model APIs -========================== - - GraphStorm provides a set of APIs for users to integrate their own customized models with - the framework of GraphStorm, so that users' own models can leverage GraphStorm's easy-to-use - and distributed capabilities. - - For how to modify users' own models, please refer to this :ref:`Use Your Own Model Tutorial - `. - - In general, there are three sets of APIs involved in programming customized models. - - * Dataloaders: users need to extend GraphStorm's abstract node or edge dataloader to implement - their own graph samplers or mini_batch generators. - * Models: depending on specific GML tasks, users need to extend the corresponding ModelBase and - ModelInterface, and then implement the required abstract functions. - * Evaluators: if necessary, users can also extend the two evaluator templates to implement their - own performance evaluation method. - -.. currentmodule:: graphstorm - -Dataloaders ------------- -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - - .. dataloading.AbsNodeDataLoader - .. dataloading.AbsEdgeDataLoader - -Models ------------- - -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - - model.GSgnnModelBase - model.GSgnnNodeModelBase - model.GSgnnEdgeModelBase - model.GSgnnLinkPredictionModelBase - model.GSgnnNodeModelInterface - model.GSgnnEdgeModelInterface - model.GSgnnLinkPredictionModelInterface - -Evaluators ------------- - - If users want to implement customized evaluators or evaluation methods, a best practice is to - extend the ``eval.GSgnnInstanceEvaluator`` class, and implement the abstract methods. - -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - - eval.GSgnnInstanceEvaluator - eval.GSgnnLPEvaluator \ No newline at end of file diff --git a/docs/source/api/graphstorm.dataloading.rst b/docs/source/api/graphstorm.dataloading.rst index 9e4b4175ed..b782602eb9 100644 --- a/docs/source/api/graphstorm.dataloading.rst +++ b/docs/source/api/graphstorm.dataloading.rst @@ -3,29 +3,48 @@ graphstorm.dataloading ========================== - GraphStorm dataloading module includes a set of graph datasets and dataloaders for different + GraphStorm dataloading module includes a set of graph DataSets and DataLoaders for different graph machine learning tasks. + If users would like to customize DataLoaders, please extend those classes in the + :ref:`Base DataLoaders <basedataloaders>` section and implement their abstract methods. + .. currentmodule:: graphstorm.dataloading +.. _basedataloaders: + +Base DataLoaders +------------------- + +.. autosummary:: + :toctree: ../generated/ + :nosignatures: + :template: dataloadertemplate.rst + + GSgnnNodeDataLoaderBase + GSgnnEdgeDataLoaderBase + GSgnnLinkPredictionDataLoaderBase + DataSets ------------ + .. autosummary:: + :toctree: ../generated/ + :nosignatures: - :template: classtemplate.rst + :template: datasettemplate.rst GSgnnNodeTrainData GSgnnNodeInferData GSgnnEdgeTrainData GSgnnEdgeInferData -Dataloaders +DataLoaders ------------ + .. autosummary:: :toctree: ../generated/ :nosignatures: - :template: classtemplate.rst + :template: dataloadertemplate.rst GSgnnNodeDataLoader GSgnnEdgeDataLoader diff --git a/docs/source/api/graphstorm.eval.rst b/docs/source/api/graphstorm.eval.rst new file mode 100644 index 0000000000..a8193f0f2c --- /dev/null +++ b/docs/source/api/graphstorm.eval.rst @@ -0,0 +1,38 @@ +..
_apieval: + +graphstorm.eval +======================= + + GraphStorm provides built-in evaluation methods for different Graph Machine + Learning (GML) tasks. + + If users want to implement customized evaluators or evaluation methods, a best practice is to + extend base evaluators, i.e., the ``GSgnnInstanceEvaluator`` class for node or edge prediction + tasks, and ``GSgnnLPEvaluator`` for link prediction tasks, and then implement the abstract methods. + +.. currentmodule:: graphstorm.eval + +Base Evaluators +---------------- + +.. autosummary:: + :toctree: ../generated/ + :nosignatures: + :template: evaltemplate.rst + + GSgnnInstanceEvaluator + GSgnnLPEvaluator + +Evaluators +----------- + +.. autosummary:: + :toctree: ../generated/ + :nosignatures: + :template: evaltemplate.rst + + GSgnnMrrLPEvaluator + GSgnnPerEtypeMrrLPEvaluator + GSgnnAccEvaluator + GSgnnRegressionEvaluator diff --git a/docs/source/api/graphstorm.evaluator.rst b/docs/source/api/graphstorm.evaluator.rst deleted file mode 100644 index 35549b2946..0000000000 --- a/docs/source/api/graphstorm.evaluator.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. _apievaluator: - -graphstorm.evaluator -======================= - - GraphStorm evaluators provides built-in evaluation methods for different Graph Machine - Learning (GML). - -.. currentmodule:: graphstorm.eval -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - - GSgnnLPEvaluator - GSgnnMrrLPEvaluator - GSgnnPerEtypeMrrLPEvaluator - GSgnnAccEvaluator - GSgnnRegressionEvaluator - diff --git a/docs/source/api/graphstorm.inferrer.rst b/docs/source/api/graphstorm.inference.rst similarity index 85% rename from docs/source/api/graphstorm.inferrer.rst rename to docs/source/api/graphstorm.inference.rst index 7ec4d7f4e4..15ef5d8a88 100644 --- a/docs/source/api/graphstorm.inferrer.rst +++ b/docs/source/api/graphstorm.inference.rst @@ -1,6 +1,6 @@ -.. _apiinferrer: +.. _apiinference: -graphstorm.inferrer +graphstorm.inference ==================== GraphStorm inferrers assemble the distributed inference pipeline for different tasks. @@ -13,7 +13,7 @@ graphstorm.inferrer .. autosummary:: :toctree: ../generated/ :nosignatures: - :template: classtemplate.rst + :template: inferencetemplate.rst GSgnnLinkPredictionInferrer GSgnnNodePredictionInferrer diff --git a/docs/source/api/graphstorm.model.rst b/docs/source/api/graphstorm.model.rst index 76053c15eb..b4052bbd1a 100644 --- a/docs/source/api/graphstorm.model.rst +++ b/docs/source/api/graphstorm.model.rst @@ -1,9 +1,9 @@ .. _apimodel: graphstorm.model -================= +=================== - A GraphStorm model normally contains three components: + A GraphStorm model may contain three components: * Input layer: a set of modules to convert input data for different use cases, e.g., embedding text features. * Decoder: a set of modules to convert results from encoders for different tasks, e.g., classification, regression, or link prediction. + Currently GraphStorm releases the first two sets of components. + + If users would like to implement their own model, the best practice is to extend the corresponding ``***ModelBase``, and implement the abstract methods. + .. currentmodule:: graphstorm.model -Model input layers +Base models +------------ + +.. autosummary:: + :toctree: ../generated/ + :nosignatures: + :template: modeltemplate.rst + + GSgnnNodeModelBase + GSgnnEdgeModelBase + GSgnnLinkPredictionModelBase + +Input Layers ------------------- ..
autosummary:: :toctree: ../generated/ @@ -24,7 +40,7 @@ Model input layers GSLMNodeEncoderInputLayer GSPureLMNodeInputLayer -Model encoders and layers +Encoders and GNN Layers -------------------------- .. autosummary:: :toctree: ../generated/ diff --git a/docs/source/api/graphstorm.rst b/docs/source/api/graphstorm.rst index 5de2db120d..2362fdee44 100644 --- a/docs/source/api/graphstorm.rst +++ b/docs/source/api/graphstorm.rst @@ -9,13 +9,12 @@ graphstorm Users can directly use the following code to use these functions. >>> import graphstorm as gs - >>> gs.initialize() - >>> gs.get_rank() + >>> gs.initialize(ip_config="/tmp/ip_list.txt", backend="gloo") + >>> gs.setup_device(local_rank) .. autosummary:: :toctree: ../generated/ + :nosignatures: gsf.initialize - gsf.get_feat_size - utils.get_rank - utils.get_world_size + utils.setup_device diff --git a/docs/source/api/graphstorm.trainer.rst b/docs/source/api/graphstorm.trainer.rst index a90d5c23ff..fd66109f62 100644 --- a/docs/source/api/graphstorm.trainer.rst +++ b/docs/source/api/graphstorm.trainer.rst @@ -11,32 +11,15 @@ graphstorm.trainer .. currentmodule:: graphstorm.trainer - -Base class +Trainers -------------- -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - - GSgnnTrainer -Task classes ----------------- .. autosummary:: :toctree: ../generated/ :nosignatures: - :template: classtemplate.rst + :template: trainertemplate.rst GSgnnLinkPredictionTrainer GSgnnNodePredictionTrainer GSgnnEdgePredictionTrainer - -Method classes ----------------- -.. autosummary:: - :toctree: ../generated/ - :nosignatures: - :template: classtemplate.rst - GLEMNodePredictionTrainer diff --git a/docs/source/configuration/configuration-gconstruction.rst b/docs/source/configuration/configuration-gconstruction.rst index 64feca1f97..b163b08d9f 100644 --- a/docs/source/configuration/configuration-gconstruction.rst +++ b/docs/source/configuration/configuration-gconstruction.rst @@ -87,7 +87,7 @@ Currently, the graph construction pipeline supports the following feature transf * **HuggingFace tokenizer transformation** tokenizes text strings with a HuggingFace tokenizer. The ``name`` field in the feature transformation dictionary is ``tokenize_hf``. The dict should contain two additional fields. ``bert_model`` specifies the LM model used for tokenization. Users can choose any `HuggingFace LM models <https://huggingface.co/models>`_ from one of the following types: ``"bert", "roberta", "albert", "camembert", "ernie", "ibert", "luke", "mega", "mpnet", "nezha", "qdqbert","roc_bert"``. ``max_seq_length`` specifies the maximal sequence length. * **HuggingFace LM transformation** encodes text strings with a HuggingFace LM model. The ``name`` field in the feature transformation dictionary is ``bert_hf``. The dict should contain two additional fields. ``bert_model`` specifies the LM model used for embedding text. Users can choose any `HuggingFace LM models <https://huggingface.co/models>`_ from one of the following types: ``"bert", "roberta", "albert", "camembert", "ernie", "ibert", "luke", "mega", "mpnet", "nezha", "qdqbert","roc_bert"``. ``max_seq_length`` specifies the maximal sequence length. -* **Numerical MAX_MIN transformation** normalizes numerical input features with `val = (val-min)/(max-min)`, where `val` is the feature value, `max` is the maximum number in the feature and `min` is the minimum number in the feature. The ``name`` field in the feature transformation dictionary is ``max_min_norm``. The dict can contains two optional fields.
``max_bound`` specifies the maximum value allowed in the feature. Any number larger than ``max_bound`` will be set to ``max_bound``. ``min_bound`` specifies the minimum value allowed in the feature. Any number smaller than ``min_bound`` will be set to ``min_bound``. +* **Numerical MAX_MIN transformation** normalizes numerical input features with `val = (val-min)/(max-min)`, where `val` is the feature value, `max` is the maximum number in the feature and `min` is the minimum number in the feature. The ``name`` field in the feature transformation dictionary is ``max_min_norm``. The dict can contain four optional fields: ``max_bound``, ``min_bound``, ``max_val`` and ``min_val``. ``max_bound`` specifies the maximum value allowed in the feature. Any number larger than ``max_bound`` will be set to ``max_bound``. Here, `max` = min(np.amax(feats), ``max_bound``). ``min_bound`` specifies the minimum value allowed in the feature. Any number smaller than ``min_bound`` will be set to ``min_bound``. Here, `min` = max(np.amin(feats), ``min_bound``). ``max_val`` defines the `max` in the transformation formula. When ``max_val`` is provided, `max` is always equal to ``max_val``. ``min_val`` defines the `min` in the transformation formula. When ``min_val`` is provided, `min` is always equal to ``min_val``. ``max_val`` and ``min_val`` are mainly used in the inference stage, where we want to use the max & min values computed in the training stage to normalize inference data. * **Numerical Rank Gauss transformation** normalizes numerical input features with rank gauss normalization. It maps the numeric feature values to a Gaussian distribution based on ranking. The method follows https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/discussion/44629#250927. The ``name`` field in the feature transformation dictionary is ``rank_gauss``. The dict can contain one optional field, i.e., ``epsilon``, which is used to avoid INF float during computation. * **Convert to categorical values** converts text data to categorical values. The `name` field is `to_categorical`. `separator` specifies how to split the string into multiple categorical values (this is only used to define multiple categorical values). If `separator` is not specified, the entire string is a categorical value. `mapping` is a dict that specifies how to map a string to an integer value that defines a categorical value. diff --git a/docs/source/index.rst b/docs/source/index.rst index 1ebee6708d..f745ad9913 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -33,18 +33,17 @@ Welcome to the GraphStorm Documentation and Tutorials advanced/advanced-usages .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: API Reference :hidden: :glob: api/graphstorm api/graphstorm.dataloading + api/graphstorm.eval + api/graphstorm.inference api/graphstorm.model api/graphstorm.trainer - api/graphstorm.inferrer - api/graphstorm.evaluator - api/graphstorm.customized GraphStorm is a graph machine learning (GML) framework designed for enterprise use cases. It simplifies the development, training and deployment of GML models on industry-scale graphs (measured in billions of nodes and edges) by providing scalable training and inference pipelines of GML models. GraphStorm comes with a collection of built-in GML models, allowing users to train a GML model with a single command, eliminating the need to write any code. Moreover, GraphStorm provides a wide range of configurations to customize model implementations and training pipelines, enhancing model performance.
In addition, GraphStorm offers a programming interface that enables users to train custom GML models in a distributed manner. Users can bring their own model implementations and leverage the GraphStorm training pipeline for scalability. diff --git a/examples/mag/README.md b/examples/mag/README.md index ab45834787..396cccf718 100644 --- a/examples/mag/README.md +++ b/examples/mag/README.md @@ -110,7 +110,7 @@ python3 -m graphstorm.run.gs_node_classification \ The accuracy is 41.88%. -### Fine-tune BERT model on the graph data and train GNN model to predict the venue +### Fine-tune BERT model on the graph data and train GNN model to predict the venue To achieve good performance, we should fine-tune the BERT model on the graph data. One way of fine-tuning the BERT model on the graph data is to fine-tune the BERT model @@ -188,3 +188,42 @@ python3 -m graphstorm.run.gs_node_classification \ The accuracy of RGCN with the BERT model fine-tuned with venue prediction is 63.22%, while the accuracy of HGT is 67.20%. + +### Co-training BERT and GNN models using GLEM to predict the venue + +[GLEM](https://arxiv.org/abs/2210.14709) is a variational EM framework that trains an LM and a GNN iteratively for semi-supervised node classification. There are two important prerequisites for achieving good performance with GLEM: + +1. The pseudolabeling technique: it predicts pseudolabels on the unlabeled nodes and uses them as an additional supervision signal for mutual distillation between the LM and GNN. This can be enabled by the `--use-pseudolabel true` argument on the command line. +2. Well pre-trained LM and GNN before the co-training: empirically, LM or GNN models that are not well-trained lead to degraded performance when co-training with GLEM directly. Therefore, we suggest users pre-train the LM and GNN first. This can be achieved by: + 1. Setting `num_pretrain_epochs` in the [yaml config](mag_glem_w_pretrain.yml). + + ``` + python3 -m graphstorm.run.gs_node_classification \ + --num-trainers 8 \ + --num-servers 4 \ + --num-samplers 0 \ + --part-config mag_min_4parts/mag.json \ + --ip-config ip_list_4p.txt \ + --cf mag_glem_w_pretrain.yml \ + --use-pseudolabel true + ``` + + 2. Restoring pretrained models from checkpoints using `--restore-model-path`. In the following example, we restore the GNN trained on the fine-tuned BERT model in the [previous section](#bert-ft-gnn). GLEM requires checkpoints of LM and GNN to be in the same path, under separate directories `LM` and `GNN`. It then loads the LM's `node_input_encoder` and GNN's `gnn_encoder` and `decoder`. Since our GNN checkpoint contains both the fine-tuned LM and GNN, we set up softlinks to point both LM and GNN to this checkpoint.
+ + ``` + # prepare paths to pretrained models: + mkdir mag_pretrained_models + ln -s mag_gnn_nc_model/epoch-7 mag_pretrained_models/LM + ln -s mag_gnn_nc_model/epoch-7 mag_pretrained_models/GNN + + # co-training pre-trained LM and GNN with GLEM: + python3 -m graphstorm.run.gs_node_classification \ + --num-trainers 8 \ + --num-servers 4 \ + --num-samplers 0 \ + --part-config mag_min_4parts/mag.json \ + --ip-config ip_list_4p.txt \ + --cf mag_glem_nc.yaml \ + --use-pseudolabel true \ + --restore-model-path mag_pretrained_models + ``` \ No newline at end of file diff --git a/examples/mag/mag_glem_nc.yaml b/examples/mag/mag_glem_nc.yaml new file mode 100644 index 0000000000..938ca1cbcb --- /dev/null +++ b/examples/mag/mag_glem_nc.yaml @@ -0,0 +1,62 @@ +--- +version: 1.0 +lm_model: + node_lm_models: + - + lm_type: bert + model_name: "bert-base-uncased" + gradient_checkpoint: false + node_types: + - paper + - + lm_type: bert + model_name: "bert-base-uncased" + gradient_checkpoint: false + node_types: + - fos +gsf: + basic: + backend: gloo + verbose: false + save_perf_results_path: null + lmgnn: + lm_train_nodes: 128 + lm_infer_batch_size: 128 + freeze_lm_encoder_epochs: 0 + model_encoder_type: rgcn + fanout: "5,5" + num_layers: 2 + hidden_size: 128 + use_mini_batch_infer: false + training_method: + name: glem + kwargs: + em_order_gnn_first: false + inference_using_gnn: true + pl_weight: 0.5 + num_pretrain_epochs: 0 + input: + restore_model_path: null + output: + save_model_path: null + save_embed_path: null + hyperparam: + dropout: 0. + lr: 0.00003 + lm_tune_lr: 0.00003 + sparse_optimizer_lr: 0.01 + num_epochs: 3 + batch_size: 128 + eval_batch_size: 128 + wd_l2norm: 0 + no_validation: false + rgcn: + num_bases: -1 + use_self_loop: false + lp_decoder_type: dot_product + use_node_embeddings: false + node_classification: + target_ntype: "paper" + label_field: "venue" + multilabel: false + num_classes: 1523 \ No newline at end of file diff --git a/examples/mag/mag_glem_w_pretrain.yml b/examples/mag/mag_glem_w_pretrain.yml new file mode 100644 index 0000000000..094bbb931b --- /dev/null +++ b/examples/mag/mag_glem_w_pretrain.yml @@ -0,0 +1,62 @@ +--- +version: 1.0 +lm_model: + node_lm_models: + - + lm_type: bert + model_name: "bert-base-uncased" + gradient_checkpoint: false + node_types: + - paper + - + lm_type: bert + model_name: "bert-base-uncased" + gradient_checkpoint: false + node_types: + - fos +gsf: + basic: + backend: gloo + verbose: false + save_perf_results_path: null + lmgnn: + lm_train_nodes: 64 + lm_infer_batch_size: 64 + freeze_lm_encoder_epochs: 0 + model_encoder_type: rgcn + fanout: "5,5" + eval_fanout: "20,20" + num_layers: 2 + hidden_size: 128 + use_mini_batch_infer: false + training_method: + name: glem + kwargs: + em_order_gnn_first: false + inference_using_gnn: true + pl_weight: 0.5 + num_pretrain_epochs: 10 + input: + restore_model_path: null + output: + save_model_path: null + save_embed_path: null + hyperparam: + dropout: 0. 
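+    # Added comment for clarity (per GraphStorm's configuration semantics): lr applies to the dense model parameters, lm_tune_lr to LM fine-tuning, and sparse_optimizer_lr to learnable sparse embeddings.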
+ lr: 0.00003 + lm_tune_lr: 0.00003 + sparse_optimizer_lr: 0.01 + num_epochs: 3 + batch_size: 64 + eval_batch_size: 64 + wd_l2norm: 0 + no_validation: false + rgcn: + num_bases: -1 + use_self_loop: true + use_node_embeddings: false + node_classification: + target_ntype: "paper" + label_field: "venue" + multilabel: false + num_classes: 1523 \ No newline at end of file diff --git a/graphstorm-processing/docs/Makefile b/graphstorm-processing/docs/Makefile new file mode 100644 index 0000000000..d0c3cbf102 --- /dev/null +++ b/graphstorm-processing/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/graphstorm-processing/docs/make.bat b/graphstorm-processing/docs/make.bat new file mode 100644 index 0000000000..6247f7e231 --- /dev/null +++ b/graphstorm-processing/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/graphstorm-processing/docs/source/conf.py b/graphstorm-processing/docs/source/conf.py new file mode 100644 index 0000000000..7334ba97ae --- /dev/null +++ b/graphstorm-processing/docs/source/conf.py @@ -0,0 +1,53 @@ +# pylint: skip-file +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'graphstorm-processing' +copyright = '2023, AGML Team' +author = 'AGML Team, Amazon' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
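+# Note (suggestion, not part of the original setup): no extensions are enabled by default; Sphinx extensions such as "sphinx.ext.autodoc" or "sphinx.ext.napoleon" could be added below if API documentation is needed later.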
+extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/graphstorm-processing/docs/source/developer/developer-guide.rst b/graphstorm-processing/docs/source/developer/developer-guide.rst new file mode 100644 index 0000000000..1a7faf85db --- /dev/null +++ b/graphstorm-processing/docs/source/developer/developer-guide.rst @@ -0,0 +1,230 @@ +Developer Guide +--------------- + +The project is set up using ``poetry`` to make it easier for developers to +jump into the project. + +The steps we recommend are: + +Install JDK 8, 11 +~~~~~~~~~~~~~~~~~ + +PySpark requires a compatible Java installation to run, so +you will need to ensure your active JDK is using either +Java 8 or 11. + +On MacOS you can do this using ``brew``: + +.. code-block:: bash + + brew install openjdk@11 + +On Linux it will depend on your distribution's package +manager. For Ubuntu you can use: + +.. code-block:: bash + + sudo apt install openjdk-11-jdk + +On Amazon Linux 2 you can use: + +.. code-block:: bash + + sudo yum install java-11-amazon-corretto-headless + sudo yum install java-11-amazon-corretto-devel + +Install ``pyenv`` +~~~~~~~~~~~~~~~~~ + +``pyenv`` is a tool to manage multiple Python version installations. It +can be installed through the installer below on a Linux machine: + +.. code-block:: bash + + curl -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer | bash + +or use ``brew`` on a Mac: + +.. code-block:: bash + + brew update + brew install pyenv + +For more info on ``pyenv`` see `its documentation <https://github.com/pyenv/pyenv>`_. + +Create a Python 3.9 env and activate it. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use Python 3.9 in our images so this most closely resembles the +execution environment on our Docker images that will be used for distributed +training. + +.. code-block:: bash + + pyenv install 3.9 + pyenv global 3.9 + +.. + + Note: We recommend not mixing up ``conda`` and ``pyenv``. When developing for + this project, simply ``conda deactivate`` until there's no ``conda`` + env active (even ``base``) and just rely on ``pyenv`` and ``poetry`` to handle + dependencies. + +Install ``poetry`` +~~~~~~~~~~~~~~~~~~ + +``poetry`` is a dependency and build management system for Python. To install it +use: + +.. code-block:: bash + + curl -sSL https://install.python-poetry.org | python3 - + +Install dependencies through ``poetry`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Now we are ready to install our dependencies through ``poetry``. + +We have split the project dependencies into the “main” dependencies that +``poetry`` installs by default, and the ``dev`` dependency group that +installs the dependencies that are only needed to develop the library. + +**On a POSIX system** (tested on Ubuntu, CentOS, MacOS) run: + +..
code-block:: bash + + # Install all dependencies into local .venv + poetry install --with dev + +Once all dependencies are installed you should be able to run the unit +tests for the project and continue with development using: + +.. code-block:: bash + + poetry run pytest ./graphstorm-processing/tests + +You can also activate and use the virtual environment using: + +.. code-block:: bash + + poetry shell + # We're now using the graphstorm-processing-py3.9 env so we can just run + pytest ./graphstorm-processing/tests + +To learn more about ``poetry`` see its `documentation <https://python-poetry.org/docs/>`_. + +Use ``black`` to format code [optional] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We use `black <https://github.com/psf/black>`_ to +format code in this project. ``black`` is an opinionated formatter that +helps speed up development and code reviews. It is included in our +``dev`` dependencies so it will be installed along with the other dev +dependencies. + +To use ``black`` in the project you can run (from the project's root, +same level as ``pyproject.toml``): + +.. code-block:: bash + + # From the project's root directory (graphstorm-processing) run: + black . + +To get a preview of the changes ``black`` would make, you can use: + +.. code-block:: bash + + black . --diff --color + +You can add auto-formatting with ``black`` in VSCode using the `Black +Formatter <https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter>`__ extension. + + +Use mypy and pylint to lint code +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We include the ``mypy`` and ``pylint`` linters as dependencies under the ``dev`` group +of dependencies. These linters perform static checks on your code and +can be used in a complementary manner. + +We recommend using VSCode and enabling the mypy linter +to get in-editor annotations. + +You can also lint the project code through: + +.. code-block:: bash + + poetry run mypy ./graphstorm_processing + +To learn more about ``mypy`` and how it can help development +`see its documentation <https://mypy.readthedocs.io/>`_. + + +Our goal is to minimize ``mypy`` errors as much as possible for the +project. New code should be linted and not introduce additional mypy +errors. When necessary it's OK to use ``type: ignore`` to silence +``mypy`` errors inline, but this should be used sparingly. + +As a project, GraphStorm requires a 10/10 pylint score, so +ensure your code conforms to the expectation by running + +.. code-block:: bash + + pylint --rcfile=/path/to/graphstorm/tests/lint/pylintrc + +on your code before commits. To make this easier we include +a pre-commit hook below. + +Use a pre-commit hook to ensure ``black`` and ``pylint`` run before commits +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +To make code formatting and ``pylint`` checks easier for graphstorm-processing +developers, we recommend using a pre-commit hook. + +We include ``pre-commit`` in the project's ``dev`` dependencies, so once +you have activated the project's venv (``poetry shell``) you can just +create a file named ``.pre-commit-config.yaml`` with the following contents: + +.. code-block:: yaml + + # .pre-commit-config.yaml + repos: + - repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black + language_version: python3.9 + files: 'graphstorm_processing\/.*\.pyi?$|tests\/.*\.pyi?$|scripts\/.*\.pyi?$' + exclude: 'python\/.*\.pyi' + - repo: local + hooks: + - id: pylint + name: pylint + entry: pylint + language: system + types: [python] + args: + [ + "--rcfile=./tests/lint/pylintrc" + ] + + +And then run: + +..
code-block:: bash + + pre-commit install + +which will install the ``black`` and ``pylint`` hooks into your local repository and +ensure they run before every commit. + +.. note:: + + The pre-commit hook will also apply to all commits you make to the root + GraphStorm repository. Since GraphStorm doesn't use ``black``, you might + want to remove the hooks. You can do so from the root repo + using ``rm -rf .git/hooks``. + + Both projects use ``pylint`` to check Python files so we'd still recommend using + that hook even if you're doing development for both GSProcessing and GraphStorm. diff --git a/graphstorm-processing/docs/source/developer/input-configuration.rst b/graphstorm-processing/docs/source/developer/input-configuration.rst new file mode 100644 index 0000000000..e6e2d7ae98 --- /dev/null +++ b/graphstorm-processing/docs/source/developer/input-configuration.rst @@ -0,0 +1,430 @@ +.. _input-configuration: + +GraphStorm Processing Input Configuration +========================================= + +GraphStorm Processing uses a JSON configuration file to +parse and process the data into the format needed +by GraphStorm partitioning and training downstream. + +We use this configuration format as an intermediate +between other config formats, such as the one used +by the single-machine GConstruct module. + +GSProcessing can take a GConstruct-formatted file +directly, and we also provide a script +that can convert a GConstruct +input configuration file into the ``GSProcessing`` format, +although this is mostly aimed at developers; users +can rely on the automatic conversion. + +The GSProcessing input data configuration has two top-level objects: + +.. code-block:: json + + { + "version": "gsprocessing-v1.0", + "graph": {} + } + +- ``version`` (String, required): The version of the configuration file being used. We include + the package name to allow self-contained identification of the file format. +- ``graph`` (JSON object, required): one configuration object that defines each + of the node types and edge types that describe the graph. + +We describe the ``graph`` object next. + +``graph`` configuration object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``graph`` configuration object can have two top-level objects: + +.. code-block:: json + + { + "edges": [{}], + "nodes": [{}] + } + +- ``edges``: (array of JSON objects, required). Each JSON object + in this array describes one edge type and determines how the edge + structure will be parsed. +- ``nodes``: (array of JSON objects, optional). Each JSON object + in this array describes one node type. This key is optional; if + it is missing, node IDs are derived from the ``edges`` objects. + +-------------- + +Contents of an ``edges`` configuration object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An ``edges`` configuration object can contain the following top-level +objects: + +.. code-block:: json + + { + "data": { + "format": "String", + "files": ["String"], + "separator": "String" + }, + "source": {"column": "String", "type": "String"}, + "relation": {"type": "String"}, + "destination": {"column": "String", "type": "String"}, + "labels" : [ + { + "column": "String", + "type": "String", + "split_rate": { + "train": "Float", + "val": "Float", + "test": "Float" + } + } + ], + "features": [{}] + } + +- ``data`` (JSON Object, required): Describes the physical files + that store the data described in this object.
The JSON object has the following + top-level keys: + + - ``format`` (String, required): indicates the format the data is + stored in. We accept either ``"csv"`` or ``"parquet"`` as valid + file formats. + + - ``files`` (array of String, required): the physical location of + files. The format accepts two options: + + - a single-element list with a directory-like (ending in ``/``) + **relative** path under which all the files that correspond to + the current edge type are stored. + + - e.g. ``"files": ['path/to/edge/type/']`` + - This option allows for concise listing of entire types and + is preferred. All the files under the path will be loaded. + + - a multi-element list of **relative** file paths. + + - ``"files": ['path/to/edge/type/file_1.csv', 'path/to/edge/type/file_2.csv']`` + - This option allows for multiple types to be stored under the + same input prefix, but will result in more verbose spec + files. + + - Since the spec expects **relative paths**, the caller is + responsible for providing a path prefix to the execution + engine. The prefix will determine if the source is a local + filesystem or S3, allowing the spec to be portable, i.e. a user + can move the physical files and the spec will still be valid, + as long as the relative structure is kept. + + - ``separator`` (String, optional): Only relevant for CSV files, + determines the separator used between each column in the files. + +- ``source``: (JSON object, required): Describes the source nodes + for the edge type. The top-level keys for the object are: + + - ``column``: (String, required) The name of the column in the + physical data files. + - ``type``: (String, optional) The type name of the nodes. If not + provided, we assume that the column name is the type name. + +- ``destination``: (JSON object, required): Describes the + destination nodes for the edge type. Its format is the same as the + ``source`` key, with a JSON object that contains + ``{"column": String, "type": String}``. +- ``relation``: (JSON object, required): Describes the relation + modeled by the edges. A relation can be common among all edges, or it + can have sub-types. The top-level keys for the object are: + + - ``type`` (String, required): The type of the relation described by + the edges. For example, for a source type ``user``, destination + ``movie`` we can have a relation type ``interacted_with`` for an + edge type ``user:interacted_with:movie``. + +- ``labels`` (List of JSON objects, optional): Describes the label + for the current edge type. The label object has the following + top-level keys: + + - ``column`` (String, required): The column that contains the values + for the label. Should be the empty string, ``""`` if the ``type`` + key has the value ``"link_prediction"``. + - ``type`` (String, required): The type of the learning task. Can + take the following String values: + + - ``"classification"``: An edge classification task. The values + in the specified ``column`` are treated as categorical + variables. + - ``"regression"``: An edge regression task. The values in the + specified ``column`` are treated as numerical values. + - ``"link_prediction"``: A link prediction task. The ``column`` + should be ``""`` in this case. + + - ``separator``: (String, optional): For multi-label classification + tasks, this separator is used within the column to list multiple + classification labels in one entry. + - ``split_rate`` (JSON object, optional): Defines a split rate + for the label items.
The sum of the values for ``train``, ``val`` and + ``test`` needs to be 1.0. + + - ``train``: The percentage of the data with available labels to + assign to the train set (0.0, 1.0]. + - ``val``: The percentage of the data with available labels to + assign to the validation set [0.0, 1.0). + - ``test``: The percentage of the data with available labels to + assign to the test set [0.0, 1.0). + +- ``features`` (List of JSON objects, optional)\ **:** Describes + the set of features for the current edge type. See the :ref:`features-object` section for details. + +-------------- + +Contents of a ``nodes`` configuration object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A node configuration object in a ``nodes`` field can contain the +following top-level keys: + +.. code-block:: json + + { + "data": { + "format": "String", + "files": ["String"], + "separator": "String" + }, + "column" : "String", + "type" : "String", + "labels" : [ + { + "column": "String", + "type": "String", + "separator": "String", + "split_rate": { + "train": "Float", + "val": "Float", + "test": "Float" + } + } + ], + "features": [{}] + } + +- ``data``: (JSON object, required): Has the same definition as for + the edges object, with one top-level key for the ``format`` that + takes a String value, and one for the ``files`` that takes an array + of String values. +- ``column``: (String, required): The name of the column in the data + that stores the node IDs. +- ``type``: (String, optional): A type name for the nodes described + in this object. If not provided, the ``column`` value is used as the + node type. +- ``labels``: (List of JSON objects, optional): Similar to the + labels object defined for edges, but the values that the ``type`` can + take are different. + + - ``column`` (String, required): The name of the column that + contains the label values. + - ``type`` (String, required): Specifies the target task type, which + can be: + + - ``"classification"``: A node classification task. The values in the specified + ``column`` are treated as categorical variables. + - ``"regression"``: A node regression task. The values in the specified + ``column`` are treated as float values. + + - ``separator`` (String, optional): For multi-label + classification tasks, this separator is used within the column to + list multiple classification labels in one entry. + + - e.g. with separator ``|`` we can have ``action|comedy`` as a + label value. + + - ``split_rate`` (JSON object, optional): Defines a split rate + for the label items. The sum of the values for ``train``, ``val`` and + ``test`` needs to be 1.0. + + - ``train``: The percentage of the data with available labels to + assign to the train set (0.0, 1.0]. + - ``val``: The percentage of the data with available labels to + assign to the validation set [0.0, 1.0). + - ``test``: The percentage of the data with available labels to + assign to the test set [0.0, 1.0). + +- ``features`` (List of JSON objects, optional): Describes + the set of features for the current node type. See the next section, :ref:`features-object` + for details. + +-------------- + +.. _features-object: + +Contents of a ``features`` configuration object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An element of a ``features`` configuration object (for edges or nodes) +can contain the following top-level keys: + +..
code-block:: json + + { + "column": "String", + "name": "String", + "transformation": { + "name": "String", + "kwargs": { + "arg_name": "" + } + }, + "data": { + "format": "String", + "files": ["String"], + "separator": "String" + } + } + +- ``column`` (String, required): The column that contains the raw + feature values in the dataset. +- ``transformation`` (JSON object, optional): The type of + transformation that will be applied to the feature. For details on + the individual transformations supported see :ref:`supported-transformations`. + If this key is missing, the feature is treated as + a **no-op** feature without ``kwargs``. + + - ``name`` (String, required): The name of the transformation to be + applied. + - ``kwargs`` (JSON object, optional): A dictionary of parameter + names and values. Each individual transformation will have its own + supported parameters, described in :ref:`supported-transformations`. + +- ``name`` (String, optional): The name that will be given to the + encoded feature. If not given, **column** is used as the output name. +- ``data`` (JSON object, optional): If the data for the feature + exist in a file source that's different from the rest of the data of + the node/edge type, they are provided here. For example, you could + keep each feature in its own file source: + + .. code-block:: python + + # Example node config with multiple features + { + # This is where the node structure data exist; we just need an id col + "data": { + "format": "parquet", + "files": ["path/to/node_ids"] + }, + "column" : "node_id", + "type" : "my_node_type", + "features": [ + # Feature 1 + { + "column": "feature_one", + # The files contain one "node_id" col and one "feature_one" col + "data": { + "format": "parquet", + "files": ["path/to/feature_one/"] + } + }, + # Feature 2 + { + "column": "feature_two", + # The files contain one "node_id" col and one "feature_two" col + "data": { + "format": "parquet", + "files": ["path/to/feature_two/"] + } + } + ] + } + + + **The file source needs + to contain the column names of the parent node/edge type to allow a + 1-1 mapping between the structure and feature files.** + + For nodes, the feature files need to have one column named with the node id column + name (the value of ``"column"`` for the parent node type); + for edges we need both the ``source`` and + ``destination`` columns to use as a composite key. + +.. _supported-transformations: + +Supported transformations +~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this section we'll describe the transformations we support. +The name of the transformation is the value that would appear +in the ``transformation['name']`` element of the feature configuration, +with the attached ``kwargs`` for the transformations that support +arguments. + +- ``no-op`` + + - Passes along the data as-is to be written to storage and + used in the partitioning pipeline. The data are assumed to be single + values or vectors of floats. + - ``kwargs``: + + - ``separator`` (String, optional): Only relevant for CSV file + sources, when a separator is used to encode vector feature + values into one column. If given, the separator will be used to + split the values in the column and create a vector column + output. Example: for a separator ``'|'`` the CSV value + ``1|2|3`` would be transformed to a vector, ``[1, 2, 3]``. + +-------------- + +Examples +~~~~~~~~ + +OAG-Paper dataset +----------------- + +..
code-block:: json + + { + "version" : "gsprocessing-v1.0", + "graph" : { + "edges" : [ + { + "data": { + "format": "csv", + "files": [ + "edges.csv" + ], + "separator": "," + }, + "source": {"column": "~from", "type": "paper"}, + "destination": {"column": "~to", "type": "paper"}, + "relation": {"type": "cites"} + } + ], + "nodes" : [ + { + "data": { + "format": "csv", + "separator": ",", + "files": [ + "node_feat.csv" + ] + }, + "type": "paper", + "column": "ID", + "labels": [ + { + "column": "field", + "type": "classification", + "separator": ";", + "split_rate": { + "train": 0.7, + "val": 0.1, + "test": 0.2 + } + } + ] + } + ] + } + } diff --git a/graphstorm-processing/docs/source/index.rst b/graphstorm-processing/docs/source/index.rst new file mode 100644 index 0000000000..cc027cbb08 --- /dev/null +++ b/graphstorm-processing/docs/source/index.rst @@ -0,0 +1,154 @@ +.. graphstorm-processing documentation master file, created by + sphinx-quickstart on Tue Aug 1 02:04:45 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to GraphStorm Distributed Data Processing documentation! +================================================================= + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + Example <usage/example> + Distributed processing setup <usage/distributed-processing-setup> + Running on Amazon SageMaker <usage/amazon-sagemaker> + Developer Guide <developer/developer-guide> + Input configuration <developer/input-configuration> + + +GraphStorm Distributed Data Processing allows you to process and prepare massive graph data +for training with GraphStorm. GraphStorm Processing takes care of generating +unique ids for nodes, using them to encode edge structure files, process +individual features and prepare the data to be passed into the +distributed partitioning and training pipeline of GraphStorm. + +We use PySpark to achieve +horizontal parallelism, allowing us to scale to graphs with billions of nodes +and edges. + +.. _installation-ref: + +Installation +------------ + +The project uses Python 3.9. We recommend using `PyEnv <https://github.com/pyenv/pyenv>`_ +to have isolated Python installations. + +With PyEnv installed you can create and activate a Python 3.9 environment using: + +.. code-block:: bash + + pyenv install 3.9 + pyenv local 3.9 + + +With a recent version of ``pip`` installed (we recommend ``pip>=21.3``), you can simply run ``pip install .`` +from the root directory of the project (``graphstorm/graphstorm-processing``), +which should install the library into your environment and pull in all dependencies. + +Install Java 8, 11, or 17 +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Spark has a runtime dependency on the JVM, so you'll need to ensure +Java is installed and available on your system. + +On MacOS you can install Java using ``brew``: + +.. code-block:: bash + + brew install openjdk@11 + +On Linux it will depend on your distribution's package +manager. For Ubuntu you can use: + +.. code-block:: bash + + sudo apt install openjdk-11-jdk + +On Amazon Linux 2 you can use: + +.. code-block:: bash + + sudo yum install java-11-amazon-corretto-headless + sudo yum install java-11-amazon-corretto-devel + +To check if Java is installed you can use: + +.. code-block:: bash + + java -version + + +Example +------- + +See the provided :doc:`usage/example` for an example of how to start with tabular +data and convert them into a graph representation before partitioning and +training with GraphStorm. + +Usage +----- + +To use the library to process your data, you will need to have your data +in a tabular format, and a corresponding JSON configuration file that describes the +data.
The input data can be in CSV (with header(s)) or Parquet format. + +The configuration file can be in GraphStorm's GConstruct format, +with the caveat that the file paths need to be relative to the +location of the config file. See :doc:`/usage/example` for more details. + +After installing the library, executing a processing job locally can be done using: + +.. code-block:: bash + + gs-processing \ + --config-filename gconstruct-config.json \ + --input-prefix /path/to/input/data \ + --output-prefix /path/to/output/data + +Once the processing engine has processed the data, we want to ensure +they match the requirements of the DGL distributed partitioning +pipeline, so we need to run an additional script that will +make sure the produced data matches the assumptions of DGL [#f1]_. + +.. note:: + + Ensure you pass the output path of the previous step as the input path here. + +.. code-block:: bash + + gs-repartition \ + --input-prefix /path/to/output/data + +Once this script completes, the data are ready to be fed into DGL's distributed +partitioning pipeline. +See the GraphStorm SageMaker guide +for more details on how to use GraphStorm distributed partitioning on SageMaker. + +See :doc:`/usage/example` for a detailed walkthrough of using GSProcessing to +wrangle data into a format that's ready to be consumed by the GraphStorm/DGL +partitioning pipeline. + + +Using with Amazon SageMaker +--------------------------- + +To run distributed jobs on Amazon SageMaker we will have to build a Docker image +and push it to the Amazon Elastic Container Registry, which we cover in +:doc:`usage/distributed-processing-setup` and run a SageMaker Processing +job, which we describe in :doc:`/usage/amazon-sagemaker`. + + +Developer guide +--------------- + +To get started with developing the package refer to :doc:`/developer/developer-guide`. + + +.. rubric:: Footnotes + +.. [#f1] DGL expects that every file produced for a single node/edge type + has matching row counts, which is something that Spark cannot guarantee. + We use the re-partitioning script to fix this where needed in the produced + output. \ No newline at end of file diff --git a/graphstorm-processing/docs/source/usage/amazon-sagemaker.rst b/graphstorm-processing/docs/source/usage/amazon-sagemaker.rst new file mode 100644 index 0000000000..53fe61c922 --- /dev/null +++ b/graphstorm-processing/docs/source/usage/amazon-sagemaker.rst @@ -0,0 +1,154 @@ +Running distributed jobs on Amazon SageMaker +============================================ + +Once the :doc:`distributed processing setup </usage/distributed-processing-setup>` is complete, we can +use the Amazon SageMaker launch scripts to launch distributed processing +jobs that use AWS resources. + +To demonstrate the usage of GSProcessing on Amazon SageMaker, we will execute the same job we used in our local +execution example, but this time use Amazon SageMaker to provide the compute resources instead of our +local machine. + +Upload data to S3 +----------------- + +Amazon SageMaker uses S3 as its storage target, so before starting +we'll need to upload our test data to S3. To do so you will need +to have read/write access to an S3 bucket, and the requisite AWS credentials +and permissions. + +We will use the AWS CLI to upload data so make sure it is +`installed <https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html>`_ +and `configured <https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html>`_ +in your local environment. + +Assuming ``graphstorm/graphstorm-processing`` is our current working +directory we can upload the test data to S3 using: + +.. code-block:: bash + + MY_BUCKET="enter-your-bucket-name-here" + REGION="bucket-region" # e.g.
+   aws --region ${REGION} s3 sync ./tests/resources/small_heterogeneous_graph/ \
+       "s3://${MY_BUCKET}/gsprocessing-input"
+
+.. note::
+
+   Make sure you are uploading your data to a bucket
+   that was created in the same region as the ECR image
+   you pushed in :doc:`/usage/distributed-processing-setup`.
+
+
+Launch the GSProcessing job on Amazon SageMaker
+-----------------------------------------------
+
+Once the data are uploaded to S3, we can use the Python script
+``graphstorm-processing/scripts/run_distributed_processing.py``
+to run a GSProcessing job on Amazon SageMaker.
+
+For this example we'll use a SageMaker Spark cluster with 2 ``ml.t3.xlarge`` instances
+since this is a tiny dataset. Using SageMaker you'll be able to create clusters
+of up to 20 instances, allowing you to scale your processing to massive graphs,
+using larger instances like ``ml.r5.24xlarge``.
+
+Since we're now executing on AWS, we'll need access to an execution role
+for SageMaker and the ECR image URI we created in :doc:`/usage/distributed-processing-setup`.
+For instructions on how to create an execution role for SageMaker
+see the `AWS SageMaker documentation `_.
+
+Let's set up a small bash script that will run the parametrized processing
+job, followed by the re-partitioning job, both on SageMaker:
+
+.. code-block:: bash
+
+   ACCOUNT="enter-your-account-id-here" # e.g. 1234567890
+   MY_BUCKET="enter-your-bucket-name-here"
+   SAGEMAKER_ROLE_NAME="enter-your-sagemaker-execution-role-name-here"
+   REGION="bucket-region" # e.g. us-west-2
+   DATASET_S3_PATH="s3://${MY_BUCKET}/gsprocessing-input"
+   OUTPUT_BUCKET=${MY_BUCKET}
+   DATASET_NAME="small-graph"
+   CONFIG_FILE="gconstruct-config.json"
+   INSTANCE_COUNT="2"
+   INSTANCE_TYPE="ml.t3.xlarge"
+   NUM_FILES="4"
+
+   IMAGE_URI="${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/graphstorm-processing:0.1.0"
+   ROLE="arn:aws:iam::${ACCOUNT}:role/service-role/${SAGEMAKER_ROLE_NAME}"
+
+   OUTPUT_PREFIX="s3://${OUTPUT_BUCKET}/gsprocessing/${DATASET_NAME}/${INSTANCE_COUNT}x-${INSTANCE_TYPE}-${NUM_FILES}files/"
+
+   # Conditionally delete data at output
+   echo "Delete all data under output path? ${OUTPUT_PREFIX}"
+   select yn in "Yes" "No"; do
+       case $yn in
+           Yes ) aws s3 rm --recursive ${OUTPUT_PREFIX} --quiet; break;;
+           No ) break;;
+       esac
+   done
+
+   # This will run and block until the GSProcessing job is done
+   python scripts/run_distributed_processing.py \
+       --s3-input-prefix ${DATASET_S3_PATH} \
+       --s3-output-prefix ${OUTPUT_PREFIX} \
+       --role ${ROLE} \
+       --image ${IMAGE_URI} \
+       --region ${REGION} \
+       --config-filename ${CONFIG_FILE} \
+       --instance-count ${INSTANCE_COUNT} \
+       --instance-type ${INSTANCE_TYPE} \
+       --job-name "${DATASET_NAME}-${INSTANCE_COUNT}x-${INSTANCE_TYPE//./-}-${NUM_FILES}files" \
+       --num-output-files ${NUM_FILES} \
+       --wait-for-job
+
+   # This will run the follow-up re-partitioning job
+   python scripts/run_repartitioning.py --s3-input-prefix ${OUTPUT_PREFIX} \
+       --role ${ROLE} --image ${IMAGE_URI} --config-filename "metadata.json" \
+       --instance-type ${INSTANCE_TYPE} --wait-for-job
+
+
+.. note::
+
+   The re-partitioning job runs on a single instance, so for large graphs you will
+   want to scale up to an instance with more memory to avoid memory errors. ``ml.r5`` instances
+   should allow you to re-partition graph data with billions of nodes and edges.
+
+The ``--num-output-files`` parameter
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can see that we provided a parameter named
+``--num-output-files`` to ``run_distributed_processing.py``.
+This is an important parameter, as it provides a hint to set the parallelism for Spark.
+
+You can safely skip it and let Spark decide the proper value based on the cluster's
+instance type and count. If you do set it yourself, a good value to use is
+``num_instances * num_cores_per_instance * 2``, which will ensure good
+utilization of the cluster resources.
+
+
+Examine the output
+------------------
+
+Once both jobs are finished we can examine the output created, which
+should match the output we saw when running the same jobs locally
+in :doc:`/usage/example`:
+
+
+.. code-block:: bash
+
+   $ aws s3 ls ${OUTPUT_PREFIX}
+
+                              PRE edges/
+                              PRE node_data/
+                              PRE node_id_mappings/
+   2023-08-05 00:47:36        804 launch_arguments.json
+   2023-08-05 00:47:36      11914 metadata.json
+   2023-08-05 00:47:37        545 perf_counters.json
+   2023-08-05 00:47:37      12082 updated_row_counts_metadata.json
+
+Run distributed partitioning and training on Amazon SageMaker
+-------------------------------------------------------------
+
+With the data now processed you can follow the
+`GraphStorm Amazon SageMaker guide `_
+to partition your data and run training on AWS.
diff --git a/graphstorm-processing/docs/source/usage/distributed-processing-setup.rst b/graphstorm-processing/docs/source/usage/distributed-processing-setup.rst new file mode 100644 index 0000000000..785dd5a514 --- /dev/null +++ b/graphstorm-processing/docs/source/usage/distributed-processing-setup.rst @@ -0,0 +1,136 @@
+Distributed Processing setup for Amazon SageMaker
+=================================================
+
+In this guide we'll demonstrate how to prepare your environment to run
+GraphStorm Processing (GSP) jobs on Amazon SageMaker.
+
+We assume a Linux host environment throughout
+this tutorial, but other operating systems should work as well.
+
+The steps required are:
+
+- Clone the GraphStorm repository.
+- Install Docker.
+- Install Poetry.
+- Set up AWS access.
+- Build the GraphStorm Processing image using Docker.
+- Push the image to the Amazon Elastic Container Registry (ECR).
+- Launch a SageMaker Processing job using the example scripts.
+
+Clone the GraphStorm repository
+-------------------------------
+
+You can clone the GraphStorm repository using:
+
+.. code-block:: bash
+
+   git clone https://github.com/awslabs/graphstorm.git
+
+You can then navigate to the ``graphstorm-processing/docker`` directory
+that contains the relevant code:
+
+.. code-block:: bash
+
+   cd ./graphstorm/graphstorm-processing/docker
+
+Install Docker
+--------------
+
+To get started with building the GraphStorm Processing image
+you'll need to have the Docker engine installed.
+
+
+To install Docker follow the instructions at the
+`official site `_.
+
+Install Poetry
+--------------
+
+We use `Poetry `_ as our build
+tool and for dependency management,
+so we need to install it to facilitate building the library.
+
+You can install Poetry using:
+
+.. code-block:: bash
+
+   curl -sSL https://install.python-poetry.org | python3 -
+
+For detailed installation instructions see the
+`Poetry docs `_.
+
+
+Set up AWS access
+-----------------
+
+To build and push the image to ECR we'll make use of the
+``aws-cli`` and we'll need valid AWS credentials as well.
+
+To install the AWS CLI you can use:
+
+.. code-block:: bash
+
+   curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+   unzip awscliv2.zip
+   sudo ./aws/install
+
+To set up credentials for use with ``aws-cli`` see the
+`AWS docs `_.
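+
+Before building and pushing the image, it can help to confirm which AWS
+account and identity your credentials resolve to. The following is a minimal
+illustrative sketch, not part of GSProcessing itself; it assumes the
+``boto3`` package is installed:
+
+.. code-block:: python
+
+   import boto3
+
+   # Ask STS which account and identity the configured credentials belong to.
+   # The account ID printed here should match the account that will host the
+   # ECR repository and the SageMaker execution role.
+   identity = boto3.client("sts").get_caller_identity()
+   print(f"Account: {identity['Account']}")
+   print(f"Identity ARN: {identity['Arn']}")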
+
+Your role should have full ECR access to be able to pull from ECR to build the image,
+create an ECR repository if it doesn't exist, and push the GSProcessing image to the repository.
+
+Building the GraphStorm Processing image using Docker
+-----------------------------------------------------
+
+Once Docker and Poetry are installed, and your AWS credentials are set up,
+we can use the provided scripts
+in the ``graphstorm-processing/docker`` directory to build the image.
+
+The ``build_gsprocessing_image.sh`` script can build the image
+locally and tag it. For example, assuming our current directory is where
+we cloned ``graphstorm/graphstorm-processing``:
+
+.. code-block:: bash
+
+   bash docker/build_gsprocessing_image.sh
+
+The above will use the Dockerfile of the latest available GSProcessing version,
+build an image and tag it as ``graphstorm-processing:${VERSION}`` where
+``${VERSION}`` will be the latest available GSProcessing version (e.g. ``0.1.0``).
+
+The script also supports other arguments to customize the image name,
+tag and other aspects of the build. See ``bash docker/build_gsprocessing_image.sh --help``
+for more information.
+
+Push the image to the Amazon Elastic Container Registry (ECR)
+-------------------------------------------------------------
+
+Once the image is built we can use the ``push_gsprocessing_image.sh`` script
+that will create an ECR repository if needed and push the image we just built.
+
+The script does not require any arguments and by default will
+create a repository named ``graphstorm-processing`` in the ``us-west-2`` region,
+on the default AWS account ``aws-cli`` is configured for,
+and push the image tagged with the latest version of GSProcessing.
+
+The script supports 4 optional arguments:
+
+1. Image name/repository. (``-i/--image``) Default: ``graphstorm-processing``.
+2. Image tag. (``-v/--version``) Default: the latest available GSProcessing version, e.g. ``0.1.0``.
+3. ECR region. (``-r/--region``) Default: ``us-west-2``.
+4. AWS Account ID. (``-a/--account``) Default: uses the account ID detected by the ``aws-cli``.
+
+Example:
+
+.. code-block:: bash
+
+   bash push_gsprocessing_image.sh -i "graphstorm-processing" -v "0.1.0" -r "us-west-2" -a "1234567890"
+
+
+Launch a SageMaker Processing job using the example scripts
+------------------------------------------------------------
+
+Once the setup is complete, you can follow the
+:doc:`SageMaker Processing job guide `
+to launch your distributed processing job using AWS resources.
diff --git a/graphstorm-processing/docs/source/usage/example.rst b/graphstorm-processing/docs/source/usage/example.rst new file mode 100644 index 0000000000..ab25b5a1f1 --- /dev/null +++ b/graphstorm-processing/docs/source/usage/example.rst @@ -0,0 +1,268 @@
+GraphStorm Processing example
+=============================
+
+To demonstrate how to use the library locally we will
+use the same example data as we use in our
+unit tests, which you can find in the project's repository,
+under ``graphstorm/graphstorm-processing/tests/resources/small_heterogeneous_graph``.
+
+Install example dependencies
+----------------------------
+
+To run the local example you will need to install the GSProcessing
+library to your Python environment, and you'll need to clone the
+GraphStorm repository to get access to the data.
+
+Follow the :ref:`installation-ref` guide to install the GSProcessing library.
+
+You can clone the repository using:
+
+.. code-block:: bash
+
+   git clone https://github.com/awslabs/graphstorm.git
+
+You can then navigate to the ``graphstorm-processing/`` directory
+that contains the relevant data:
+
+.. code-block:: bash
+
+   cd ./graphstorm/graphstorm-processing/
+
+
+Expected file inputs and configuration
+--------------------------------------
+
+GSProcessing expects the input files to be in a specific format that will allow
+us to perform the processing and prepare the data for partitioning and training.
+
+The data files are expected to be:
+
+* Tabular data files. We support CSV-with-header format, or Parquet format.
+  The files can be split (multiple parts), or a single file.
+* Available on a local file system or on S3.
+* One tabular file source per edge and node type. For example, for a particular edge
+  type, all node identifiers (source, destination), features, and labels should
+  exist as columns in a single file source.
+
+Apart from the data, GSProcessing also requires a configuration file that describes the
+data and the transformations we will need to apply to the features and any encoding needed for
+labels.
+We support both the `GConstruct configuration format `_,
+and the library's own GSProcessing format, described in :doc:`/developer/input-configuration`.
+
+.. note::
+   We expect end users to only provide a GConstruct configuration file,
+   and only use the configuration format of GSProcessing as an intermediate
+   layer to decouple the two projects.
+
+   Developers who are looking to use GSProcessing
+   as their backend processing engine can either use the GSProcessing configuration
+   format directly, or translate their own configuration format to GSProcessing,
+   as we do with GConstruct.
+
+For a detailed description of all the entries of the GSProcessing configuration file see
+:doc:`/developer/input-configuration`.
+
+Relative file paths required
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The one difference from single-instance GConstruct files
+is that we require the file paths listed in the configuration file to be
+`relative to the location of the configuration file.` Specifically:
+
+* All file paths listed **must not** start with ``/``.
+* Assuming the configuration file is under ``$PATH``, and a filepath is listed as ``${FILEPATH}``
+  in the configuration file, the corresponding file is expected to exist at ``${PATH}/${FILEPATH}``.
+
+For example:
+
+.. code-block:: bash
+
+   > pwd
+   /home/path/to/data/ # This is the current working directory (cwd)
+   > ls
+   gconstruct-config.json edge_data # These are the files under the cwd
+   > ls edge_data/ # These are the files under the edge_data directory
+   movie-included_in-genre.csv
+
+The contents of the ``gconstruct-config.json`` can be:
+
+.. code-block:: python
+
+   {
+       "edges" : [
+           {
+               # Note that the file is a relative path
+               "files": ["edge_data/movie-included_in-genre.csv"],
+               "format": {
+                   "name": "csv",
+                   "separator" : ","
+               }
+               # [...] Other edge config values
+           }
+       ]
+   }
+
+Given the above we can run a job with local input data as:
+
+.. code-block:: bash
+
+   > gs-processing --input-data /home/path/to/data \
+       --config-filename gconstruct-config.json
+
+The benefit of using relative paths is that we can move the same files
+to any location, including S3, and run the same job without making changes to the config
+file:
+
+.. code-block:: bash
+
+   # Move all files to a new directory
+   > mv /home/path/to/data /home/new-path/to/data
+   # After moving all the files we can still use the same config
+   > gs-processing --input-data /home/new-path/to/data \
+       --config-filename gconstruct-config.json
+
+   # Upload data to S3
+   > aws s3 sync /home/new-path/to/data s3://my-bucket/data/
+   # We can still use the same config, just change the prefix to an S3 path
+   > python run_distributed_processing.py --input-data s3://my-bucket/data \
+       --config-filename gconstruct-config.json
+
+Node files are optional
+^^^^^^^^^^^^^^^^^^^^^^^
+
+GSProcessing does not require node files to be provided for
+every node type. If a node type appears in one of the edges,
+its unique node identifiers will be determined by the edge files.
+
+In the example GConstruct file above (`gconstruct-config.json`), the node ids for the node types
+``movie`` and ``genre`` will be extracted from the edge list provided.
+
+Example data and configuration
+------------------------------
+
+For this example we use a small heterogeneous graph inspired by the Movielens dataset.
+You can see the configuration file under
+``graphstorm-processing/tests/resources/small_heterogeneous_graph/gconstruct-config.json``.
+
+We have 4 node types: ``movie``, ``genre``, ``director``, and ``user``. The graph has 3
+edge types: ``movie:included_in:genre``, ``user:rated:movie``, and ``director:directed:movie``.
+
+We include one ``no-op`` feature, ``age``, that we directly pass to the output without any transformation,
+and one label, ``gender``, that we transform to prepare the data for a node classification task.
+
+
+Run a GSProcessing job locally
+------------------------------
+
+While GSProcessing is designed to run on distributed clusters,
+we can also run small jobs in a local environment, using a local Spark instance.
+
+To do so, we will be using the ``gs-processing`` entry point
+to process the data and create the output on our local storage.
+
+We will provide an input and output prefix for our data, passing
+local paths to the script.
+
+We also provide the argument ``--num-output-files`` that instructs PySpark
+to try to create output with 4 partitions [#f1]_.
+
+Assuming our working directory is ``graphstorm/graphstorm-processing/``
+we can use the following command to run the processing job locally:
+
+.. code-block:: bash
+
+   gs-processing --config-filename gconstruct-config.json \
+       --input-prefix ./tests/resources/small_heterogeneous_graph \
+       --output-prefix /tmp/gsprocessing-example/ \
+       --num-output-files 4
+
+
+To finalize processing and to wrangle the data into the structure that
+DGL distributed partitioning expects, we need an additional step that
+guarantees the data conform to the expectations of DGL:
+
+.. code-block:: bash
+
+   gs-repartition --input-prefix /tmp/gsprocessing-example/
+
+
+Examining the job output
+------------------------
+
+Once the processing and re-partitioning jobs are done,
+we can examine the outputs they created. The output will be
+compatible with the `Chunked Graph Format of DistDGL `_
+and can be used downstream to create a partitioned graph.
+
+.. code-block:: bash
+
+   $ cd /tmp/gsprocessing-example
+   $ ls
+
+   edges/  launch_arguments.json  metadata.json  node_data/
+   node_id_mappings/  perf_counters.json  updated_row_counts_metadata.json
+
+We have a few JSON files and the data directories containing
+the graph structure, features, and labels.
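+
+As a quick sanity check, you can load the generated ``metadata.json`` and
+inspect the node and edge types it records. This is a minimal illustrative
+sketch, not part of the official tooling, using only the Python standard
+library:
+
+.. code-block:: python
+
+   import json
+
+   # Load the metadata that gs-processing wrote to the output prefix.
+   with open("/tmp/gsprocessing-example/metadata.json", "r", encoding="utf-8") as f:
+       metadata = json.load(f)
+
+   # Among other entries, the metadata lists the node and edge types
+   # of the processed graph.
+   print(metadata["node_type"])
+   print(metadata["edge_type"])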
+In more detail:
+
+* ``launch_arguments.json``: Contains the arguments that were used
+  to launch the processing job, allowing you to check the parameters after the
+  job finishes.
+* ``updated_row_counts_metadata.json``:
+  This file is meant to be used as the input configuration for the
+  distributed partitioning pipeline. ``repartition_files.py`` produces
+  this file using the original ``metadata.json`` file as input.
+* ``metadata.json``: Created by ``gs-processing`` and used as input
+  for ``repartition_files.py``; it can be removed once that script has run.
+* ``perf_counters.json``: A JSON file that contains runtime measurements
+  for the various components of GSProcessing. Can be used to profile the
+  application and discover bottlenecks.
+
+The directories created contain:
+
+* ``edges``: Contains the edge structures, one sub-directory per edge
+  type. Each edge file will contain two columns, the source and destination
+  `numerical` node ids, named ``src_int_id`` and ``dst_int_id`` respectively.
+* ``node_data``: Contains the features for the nodes, one sub-directory
+  per node type. Each file will contain one column named after the original
+  feature name that contains the value of the feature (could be a scalar or a vector).
+* ``node_id_mappings``: Contains mappings from the original node ids to the
+  ones created by the processing job. This mapping allows you to trace
+  predictions back to the original nodes/edges. The files will have two columns,
+  ``node_str_id`` that contains the original string ID of the node, and ``node_int_id``
+  that contains the numerical id that the string id was mapped to.
+
+If the graph had included edge features they would appear
+in an ``edge_data`` directory.
+
+.. note::
+
+   It's important to note that files for edges and edge data will have the
+   same order and row counts per file, as expected by DistDGL. Similarly,
+   all node feature files will have the same order and row counts, where
+   the first row corresponds to the feature value for node id 0, the second
+   for node id 1, and so on.
+
+
+At this point you can use the DGL distributed partitioning pipeline
+to partition your data, as described in the
+`DGL documentation `_.
+
+To simplify the process of partitioning and training, without the need
+to manage your own infrastructure, we recommend using GraphStorm's
+`SageMaker wrappers `_
+that do all the hard work for you and allow
+you to focus on model development.
+
+To run GSProcessing jobs on Amazon SageMaker we'll need to follow
+:doc:`/usage/distributed-processing-setup` to set up our environment
+and :doc:`/usage/amazon-sagemaker` to execute the job.
+
+
+.. rubric:: Footnotes
+
+
+.. [#f1] Note that this is just a hint to the Spark engine, and it's
+   not guaranteed that the number of output partitions will always match
+   the requested value.
\ No newline at end of file
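To illustrate how the id mapping described above can be used, here is a minimal illustrative sketch that maps an internal integer node id back to its original id. It is an editorial addition, assuming ``pandas`` with the ``pyarrow`` engine is installed, and it uses the ``user`` node type and output path from the example:

.. code-block:: python

   import pandas as pd

   # Read the id mapping files for the "user" node type produced by the job.
   mapping = pd.read_parquet("/tmp/gsprocessing-example/node_id_mappings/user/")

   # node_int_id holds the numerical id, node_str_id the original string id.
   int_to_str = dict(zip(mapping["node_int_id"], mapping["node_str_id"]))
   print(int_to_str[0])  # original id of the node with integer id 0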
diff --git a/graphstorm-processing/graphstorm_processing/distributed_executor.py b/graphstorm-processing/graphstorm_processing/distributed_executor.py index ef2db20930..61451a5a5e 100644 --- a/graphstorm-processing/graphstorm_processing/distributed_executor.py +++ b/graphstorm-processing/graphstorm_processing/distributed_executor.py @@ -168,21 +168,31 @@ def __init__( dataset_config_dict: Dict[str, Any] = json.load(f) if "version" in dataset_config_dict: - self.config_version = dataset_config_dict["version"] - if self.config_version == "gsprocessing-v1.0": + config_version = dataset_config_dict["version"] + if config_version == "gsprocessing-v1.0": logging.info("Parsing config file as GSProcessing config") self.graph_config_dict = dataset_config_dict["graph"] - elif self.config_version == "gconstruct-v1.0": + elif config_version == "gconstruct-v1.0": logging.info("Parsing config file as GConstruct config") converter = GConstructConfigConverter() self.graph_config_dict = converter.convert_to_gsprocessing(dataset_config_dict)[ "graph" ] else: - logging.warning("Unrecognized version name: %s", self.config_version) + logging.warning("Unrecognized version name: %s", config_version) + try: + converter = GConstructConfigConverter() + self.graph_config_dict = converter.convert_to_gsprocessing(dataset_config_dict)[ + "graph" + ] + except Exception: # pylint: disable=broad-exception-caught + logging.warning("Could not parse config as GConstruct, trying GSProcessing") + assert ( + "graph" in dataset_config_dict + ), "Top-level element 'graph' needs to exist in a GSProcessing config" + self.graph_config_dict = dataset_config_dict["graph"] else: # Older versions of GConstruct configs might be missing a version entry - self.config_version = "gconstruct" converter = GConstructConfigConverter() self.graph_config_dict = converter.convert_to_gsprocessing(dataset_config_dict)["graph"] diff --git a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py index bb844d1ef0..93a5d08109 100644 --- a/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py +++ b/graphstorm-processing/graphstorm_processing/graph_loaders/dist_heterogeneous_loader.py @@ -315,7 +315,7 @@ def _initialize_metadata_dict( # Add original and reverse edge types edge_types.append(f"{src_type}:{rel_type}:{dst_type}") if self.add_reverse_edges: - edge_types.append(f"{dst_type}:rev-{rel_type}:{src_type}") + edge_types.append(f"{dst_type}:{rel_type}-rev:{src_type}") metadata_dict["edge_type"] = edge_types metadata_dict["node_type"] = sorted(node_type_set) @@ -1072,7 +1072,7 @@ def write_edge_structure( f"{edge_config.src_ntype}:{edge_config.get_relation_name()}:{edge_config.dst_ntype}" ) rev_edge_type = ( - f"{edge_config.dst_ntype}:rev-{edge_config.get_relation_name()}:{edge_config.src_ntype}" + f"{edge_config.dst_ntype}:{edge_config.get_relation_name()}-rev:{edge_config.src_ntype}" ) src_node_id_mapping = ( @@ -1223,7 +1223,7 @@ def process_edge_data(self, edge_configs: Sequence[EdgeConfig]) -> Tuple[Dict, D ) reverse_edge_type = ( f"{edge_config.dst_ntype}" - f":rev-{edge_config.get_relation_name()}" + f":{edge_config.get_relation_name()}-rev" f":{edge_config.src_ntype}" ) logging.info("Processing edge type '%s'...", edge_type) diff --git a/graphstorm-processing/graphstorm_processing/repartition_files.py
b/graphstorm-processing/graphstorm_processing/repartition_files.py index d0105267f0..1d13d34253 100644 --- a/graphstorm-processing/graphstorm_processing/repartition_files.py +++ b/graphstorm-processing/graphstorm_processing/repartition_files.py @@ -833,7 +833,7 @@ def main(): for type_idx, (type_name, type_data_dict) in enumerate(edge_data_meta.items()): src, relation, dst = type_name.split(":") - if relation.startswith("rev-"): + if relation.endswith("-rev"): # Reverse edge types do not have their own data, # and if needed we re-partition their structure while # handling the "regular" edge type. @@ -845,7 +845,7 @@ def main(): type_name, ) continue - reverse_edge_type_name = f"{dst}:rev-{relation}:{src}" + reverse_edge_type_name = f"{dst}:{relation}-rev:{src}" most_frequent_counts = list(edge_row_counts_frequencies[type_name].most_common(1)[0][0]) repartitioner = ParquetRepartitioner( input_prefix, filesystem_type, region, verify_outputs=True diff --git a/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json b/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json index 3ddaa0e2ca..bbdf13eb03 100644 --- a/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json +++ b/graphstorm-processing/tests/resources/repartitioning/partitioned_metadata.json @@ -1,7 +1,7 @@ { "edge_type": [ "src:dummy_type:dst", - "dst:rev-dummy_type:src" + "dst:dummy_type-rev:src" ], "edges": { "src:dummy_type:dst": { @@ -18,7 +18,7 @@ "delimiter": "" } }, - "dst:rev-dummy_type:src": { + "dst:dummy_type-rev:src": { "data": [ "edges/dummy_type/parquet/part-00000.parquet", "edges/dummy_type/parquet/part-00001.parquet", @@ -78,7 +78,7 @@ } } }, - "dst:rev-dummy_type:src": { + "dst:dummy_type-rev:src": { "label": { "data":[ "edge_data/dummy_type-label/parquet/part-00000.parquet", diff --git a/graphstorm-processing/tests/test_dist_heterogenous_loader.py b/graphstorm-processing/tests/test_dist_heterogenous_loader.py index c005a6527d..13e5038f9f 100644 --- a/graphstorm-processing/tests/test_dist_heterogenous_loader.py +++ b/graphstorm-processing/tests/test_dist_heterogenous_loader.py @@ -165,11 +165,11 @@ def verify_integ_test_output( assert metadata["node_type"] == ["director", "genre", "movie", "user"] assert metadata["edge_type"] == [ "movie:included_in:genre", - "genre:rev-included_in:movie", + "genre:included_in-rev:movie", "user:rated:movie", - "movie:rev-rated:user", + "movie:rated-rev:user", "director:directed:movie", - "movie:rev-directed:director", + "movie:directed-rev:director", ] expected_node_counts = {"director": 3, "genre": 2, "movie": 4, "user": 5} @@ -182,11 +182,11 @@ def verify_integ_test_output( expected_edge_counts = { "movie:included_in:genre": 4, - "genre:rev-included_in:movie": 4, + "genre:included_in-rev:movie": 4, "user:rated:movie": 6, - "movie:rev-rated:user": 6, + "movie:rated-rev:user": 6, "director:directed:movie": 4, - "movie:rev-directed:director": 4, + "movie:directed-rev:director": 4, } for edge_type in metadata["edge_type"]: @@ -266,11 +266,11 @@ def test_load_dist_hgl_without_labels(dghl_loader_no_label: DistHeterogeneousGra "task_type": "link_predict", "etype_label": [ "movie:included_in:genre", - "genre:rev-included_in:movie", + "genre:included_in-rev:movie", "user:rated:movie", - "movie:rev-rated:user", + "movie:rated-rev:user", "director:directed:movie", - "movie:rev-directed:director", + "movie:directed-rev:director", ], "etype_label_property": [], "ntype_label": [], @@ -283,18 +283,18 @@ def 
test_load_dist_hgl_without_labels(dghl_loader_no_label: DistHeterogeneousGra expected_edge_data = { "user:rated:movie": {"train_mask", "val_mask", "test_mask"}, - "movie:rev-rated:user": {"train_mask", "val_mask", "test_mask"}, + "movie:rated-rev:user": {"train_mask", "val_mask", "test_mask"}, "movie:included_in:genre": {"train_mask", "val_mask", "test_mask"}, - "genre:rev-included_in:movie": {"train_mask", "val_mask", "test_mask"}, + "genre:included_in-rev:movie": {"train_mask", "val_mask", "test_mask"}, "director:directed:movie": {"train_mask", "val_mask", "test_mask"}, - "movie:rev-directed:director": {"train_mask", "val_mask", "test_mask"}, + "movie:directed-rev:director": {"train_mask", "val_mask", "test_mask"}, } for edge_type in metadata["edge_data"]: assert metadata["edge_data"][edge_type].keys() == expected_edge_data[edge_type] - if not "rev-" in edge_type: + if not "-rev" in edge_type: src_type, relation, dst_type = edge_type.split(":") - rev_type = f"{dst_type}:rev-{relation}:{src_type}" + rev_type = f"{dst_type}:{relation}-rev:{src_type}" assert ( metadata["edge_data"][rev_type]["train_mask"] == metadata["edge_data"][edge_type]["train_mask"] diff --git a/graphstorm-processing/tests/test_repartition_files.py b/graphstorm-processing/tests/test_repartition_files.py index e2a08968db..eb9dc45bba 100644 --- a/graphstorm-processing/tests/test_repartition_files.py +++ b/graphstorm-processing/tests/test_repartition_files.py @@ -217,8 +217,8 @@ def test_verify_metadata_only_edge_data(): row_counts = [10, 10, 10, 10, 10] original_metadata_dict["edge_data"]["src:dummy_type:dst"]["label"]["row_counts"] = row_counts original_metadata_dict["edges"]["src:dummy_type:dst"]["row_counts"] = row_counts - original_metadata_dict["edges"].pop("dst:rev-dummy_type:src") - original_metadata_dict["edge_data"].pop("dst:rev-dummy_type:src") + original_metadata_dict["edges"].pop("dst:dummy_type-rev:src") + original_metadata_dict["edge_data"].pop("dst:dummy_type-rev:src") # Ensure success when counts match repartition_files.verify_metadata( diff --git a/python/graphstorm/config/argument.py b/python/graphstorm/config/argument.py index 0a39bd61e0..743ce237c6 100644 --- a/python/graphstorm/config/argument.py +++ b/python/graphstorm/config/argument.py @@ -132,9 +132,11 @@ def __init__(self, cmd_args): if hasattr(cmd_args, "logging_level") else logging.INFO log_file = cmd_args.logging_file if hasattr(cmd_args, "logging_file") else None if log_file is None: - logging.basicConfig(level=log_level) + # We need to force the logging to reset the existing logging handlers + # in order to make sure this config is effective. + logging.basicConfig(level=log_level, force=True) else: - logging.basicConfig(filename=log_file, level=log_level) + logging.basicConfig(filename=log_file, level=log_level, force=True) self.yaml_paths = cmd_args.yaml_config_file # Load all arguments from yaml config @@ -293,6 +295,7 @@ def verify_arguments(self, is_train): _ = self.restore_model_path _ = self.restore_optimizer_path _ = self.save_embed_path + _ = self.save_embed_format # Model architecture _ = self.dropout @@ -853,6 +856,19 @@ def save_embed_path(self): return self._save_embed_path return None + @property + def save_embed_format(self): + """ Specify the format of saved embeddings. + """ + # pylint: disable=no-member + if hasattr(self, "_save_embed_format"): + assert self._save_embed_format in ["pytorch", "hdf5"], \ + f"{self._save_embed_format} is not supported for save_embed_format. " \ + f"Supported formats: ['pytorch', 'hdf5']."
+ return self._save_embed_format + # default to be 'pytorch' + return "pytorch" + @property def save_model_path(self): """ Path to save the model. @@ -2012,6 +2028,8 @@ def _add_output_args(parser): group.add_argument("--save-embed-path", type=str, default=argparse.SUPPRESS, help="Save the embddings in the specified directory. " "Use none to turn off embedding saveing") + group.add_argument("--save-embed-format", type=str, default=argparse.SUPPRESS, + help="Specify the format for saved embeddings. Valid formats: ['pytorch', 'hdf5']") group.add_argument('--save-model-frequency', type=int, default=argparse.SUPPRESS, help='Save the model every N iterations.') group.add_argument('--save-model-path', type=str, default=argparse.SUPPRESS, diff --git a/python/graphstorm/data/utils.py b/python/graphstorm/data/utils.py index 0f670ce15b..dbd82a518a 100644 --- a/python/graphstorm/data/utils.py +++ b/python/graphstorm/data/utils.py @@ -236,6 +236,19 @@ def alltoall_cpu(rank, world_size, output_tensor_list, input_tensor_list): for i in range(world_size): dist.scatter(output_tensor_list[i], input_tensor_list if i == rank else None, src=i) +def alltoallv_nccl(output_tensor_list, input_tensor_list): + """ Each process scatters a list of input tensors to all processes in a cluster + and returns the gathered list of tensors in the output list, using the nccl backend. + + Parameters + ---------- + output_tensor_list : List of tensor + The received tensors + input_tensor_list : List of tensor + The tensors to exchange + """ + th.distributed.all_to_all(output_tensor_list, input_tensor_list) + def alltoallv_cpu(rank, world_size, output_tensor_list, input_tensor_list): """Each process scatters list of input tensors to all processes in a cluster and return gathered list of tensors in output list. diff --git a/python/graphstorm/dataloading/dataloading.py b/python/graphstorm/dataloading/dataloading.py index e155f5f063..fdb6ffc0ac 100644 --- a/python/graphstorm/dataloading/dataloading.py +++ b/python/graphstorm/dataloading/dataloading.py @@ -158,6 +158,10 @@ def fanout(self): class GSgnnEdgeDataLoader(GSgnnEdgeDataLoaderBase): """ The minibatch dataloader for edge prediction + GSgnnEdgeDataLoader samples a GraphStorm edge dataset into an iterable over mini-batches + of samples. Both source and destination nodes are included in the batch_graph, which + will be used by GraphStorm Trainers and Inferrers. + Parameters ------------ dataset: GSgnnEdgeData @@ -182,6 +186,23 @@ class GSgnnEdgeDataLoader(GSgnnEdgeDataLoaderBase): The node types that requires to construct node features. construct_feat_fanout : int The fanout required to construct node features. + + Examples + ------------ + To train a 2-layer GNN for edge prediction on a set of edges ``target_idx`` on + a graph where each node takes messages from 15 neighbors on the first layer + and 10 neighbors on the second. + + .. code:: python + + from graphstorm.dataloading import GSgnnEdgeTrainData + from graphstorm.dataloading import GSgnnEdgeDataLoader + from graphstorm.trainer import GSgnnEdgePredictionTrainer + + ep_data = GSgnnEdgeTrainData(...) + ep_dataloader = GSgnnEdgeDataLoader(ep_data, target_idx, fanout=[15, 10], batch_size=128) + ep_trainer = GSgnnEdgePredictionTrainer(...)
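+ # Train the edge prediction model for 10 epochs using the dataloader.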
+ ep_trainer.fit(ep_dataloader, 10) """ def __init__(self, dataset, target_idx, fanout, batch_size, device='cpu', train_task=True, reverse_edge_types_map=None, @@ -357,7 +378,11 @@ def target_eidx(self): class GSgnnLinkPredictionDataLoader(GSgnnLinkPredictionDataLoaderBase): """ Link prediction minibatch dataloader - The negative edges are sampled uniformly. + GSgnnLinkPredictionDataLoader samples a GraphStorm edge dataset into an iterable over mini-batches + of samples. In each batch, pos_graph and neg_graph are sampled subgraphs for the positive and + negative edges, which will be used by GraphStorm Trainers and Inferrers. Given a positive edge, + a negative edge is composed of the source node and a random negative destination node + drawn from a uniform distribution. Argument -------- @@ -387,6 +412,24 @@ class GSgnnLinkPredictionDataLoader(GSgnnLinkPredictionDataLoaderBase): The node types that requires to construct node features. construct_feat_fanout : int The fanout required to construct node features. + + Examples + ------------ + To train a 2-layer GNN for link prediction on a set of positive edges ``target_idx`` on + a graph where each node takes messages from 15 neighbors on the first layer + and 10 neighbors on the second. We use 10 negative edges per positive edge in this example. + + .. code:: python + + from graphstorm.dataloading import GSgnnEdgeTrainData + from graphstorm.dataloading import GSgnnLinkPredictionDataLoader + from graphstorm.trainer import GSgnnLinkPredictionTrainer + + lp_data = GSgnnEdgeTrainData(...) + lp_dataloader = GSgnnLinkPredictionDataLoader(lp_data, target_idx, fanout=[15, 10], + num_negative_edges=10, batch_size=128) + lp_trainer = GSgnnLinkPredictionTrainer(...) + lp_trainer.fit(lp_dataloader, 10) """ def __init__(self, dataset, target_idx, fanout, batch_size, num_negative_edges, device='cpu', train_task=True, reverse_edge_types_map=None, exclude_training_targets=False, @@ -961,6 +1004,10 @@ def fanout(self): class GSgnnNodeDataLoader(GSgnnNodeDataLoaderBase): """ Minibatch dataloader for node tasks + + GSgnnNodeDataLoader samples a GraphStorm node dataset into an iterable over mini-batches of + samples including target nodes and sampled neighbor nodes, which will be used by GraphStorm + Trainers and Inferrers. Parameters ---------- @@ -980,6 +1027,23 @@ class GSgnnNodeDataLoader(GSgnnNodeDataLoaderBase): The node types that requires to construct node features. construct_feat_fanout : int The fanout required to construct node features. + + Examples + ---------- + To train a 2-layer GNN for node classification on a set of nodes ``target_idx`` on + a graph where each node takes messages from 15 neighbors on the first layer + and 10 neighbors on the second. + + .. code:: python + + from graphstorm.dataloading import GSgnnNodeTrainData + from graphstorm.dataloading import GSgnnNodeDataLoader + from graphstorm.trainer import GSgnnNodePredictionTrainer + + np_data = GSgnnNodeTrainData(...) + np_dataloader = GSgnnNodeDataLoader(np_data, target_idx, fanout=[15, 10], batch_size=128) + np_trainer = GSgnnNodePredictionTrainer(...)
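+ # Train the node classification model for 10 epochs using the dataloader.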
+ np_trainer.fit(np_dataloader, 10) """ def __init__(self, dataset, target_idx, fanout, batch_size, device, train_task=True, construct_feat_ntype=None, construct_feat_fanout=5): diff --git a/python/graphstorm/dataloading/dataset.py b/python/graphstorm/dataloading/dataset.py index 97441eb4fa..b06d6a398c 100644 --- a/python/graphstorm/dataloading/dataset.py +++ b/python/graphstorm/dataloading/dataset.py @@ -139,6 +139,13 @@ def __init__(self, graph_name, part_config, node_feat_field, edge_feat_field): self._val_idxs = {} self._test_idxs = {} + if get_rank() == 0: + g = self._g + for ntype in g.ntypes: + logging.debug("%s has %d nodes.", ntype, g.number_of_nodes(ntype)) + for etype in g.canonical_etypes: + logging.debug("%s has %d edges.", str(etype), g.number_of_edges(etype)) + # Use wholegraph for feature transfer if is_distributed() and use_wholegraph(part_config): logging.info("Allocate features with Wholegraph") @@ -275,7 +282,7 @@ def get_node_feats(self, input_nodes, device='cpu'): feat_field=self._node_feat_field) def get_edge_feats(self, input_edges, edge_feat_field, device='cpu'): - """ Get the node features + """ Get the edge features Parameters ---------- @@ -382,7 +389,9 @@ def test_idxs(self): return self._test_idxs class GSgnnEdgeTrainData(GSgnnEdgeData): - """ Edge prediction training data + r""" Edge prediction training data + + The GSgnnEdgeTrainData prepares the data for training edge prediction. Parameters ---------- @@ -404,6 +413,17 @@ class GSgnnEdgeTrainData(GSgnnEdgeData): different feature names. decoder_edge_feat: str or dict of list of str Edge features used by decoder + + Examples + ---------- + .. code:: python + + from graphstorm.dataloading import GSgnnEdgeTrainData + from graphstorm.dataloading import GSgnnEdgeDataLoader + ep_data = GSgnnEdgeTrainData(graph_name='dummy', part_config=part_config, + train_etypes=[('n1', 'e1', 'n2')], label_field='label', + node_feat_field='node_feat', edge_feat_field='edge_feat') + ep_dataloader = GSgnnEdgeDataLoader(ep_data, target_idx={"e1":[0]}, fanout=[15, 10], batch_size=128) """ def __init__(self, graph_name, part_config, train_etypes, eval_etypes=None, label_field=None, node_feat_field=None, edge_feat_field=None, @@ -543,7 +563,9 @@ def pos_graph_feat_field(self): return self._pos_graph_feat_field class GSgnnEdgeInferData(GSgnnEdgeData): - """ Edge prediction inference data + r""" Edge prediction inference data + + GSgnnEdgeInferData prepares the data for edge prediction inference. Parameters ---------- @@ -563,6 +585,17 @@ class GSgnnEdgeInferData(GSgnnEdgeData): different feature names. decoder_edge_feat: str or dict of list of str Edge features used by decoder + + Examples + ---------- + .. code:: python + + from graphstorm.dataloading import GSgnnEdgeInferData + from graphstorm.dataloading import GSgnnEdgeDataLoader + ep_data = GSgnnEdgeInferData(graph_name='dummy', part_config=part_config, + eval_etypes=[('n1', 'e1', 'n2')], label_field='label', + node_feat_field='node_feat', edge_feat_field='edge_feat') + ep_dataloader = GSgnnEdgeDataLoader(ep_data, target_idx={"e1":[0]}, fanout=[15, 10], batch_size=128) """ def __init__(self, graph_name, part_config, eval_etypes, label_field=None, node_feat_field=None, edge_feat_field=None, @@ -599,13 +632,14 @@ def prepare_data(self, g): for canonical_etype in self.eval_etypes: if 'test_mask' in g.edges[canonical_etype].data: # test_mask exists - # we will do evaluation. + # we will do evaluation or inference on test data. 
test_idx = dgl.distributed.edge_split( g.edges[canonical_etype].data['test_mask'], pb, etype=canonical_etype, force_even=True) # If there are test data globally, we should add them to the dict. if test_idx is not None and dist_sum(len(test_idx)) > 0: test_idxs[canonical_etype] = test_idx + infer_idxs[canonical_etype] = test_idx else: # Inference only # we will do inference on the entire edge set @@ -706,7 +740,9 @@ def test_idxs(self): return self._test_idxs class GSgnnNodeTrainData(GSgnnNodeData): - """ Training data for node tasks + r""" Training data for node tasks + + GSgnnNodeTrainData prepares the data for training node prediction. Parameters ---------- @@ -726,6 +762,18 @@ class GSgnnNodeTrainData(GSgnnNodeData): edge_feat_field : str or dict of list of str The field of the edge features. It's a dict if different edge types have different feature names. + + Examples + ---------- + .. code:: python + + from graphstorm.dataloading import GSgnnNodeTrainData + from graphstorm.dataloading import GSgnnNodeDataLoader + + np_data = GSgnnNodeTrainData(graph_name='dummy', part_config=part_config, + train_ntypes=['n1'], label_field='label', + node_feat_field='feat') + np_dataloader = GSgnnNodeDataLoader(np_data, target_idx={'n1':[0]}, fanout=[15, 10], batch_size=128) """ def __init__(self, graph_name, part_config, train_ntypes, eval_ntypes=None, label_field=None, node_feat_field=None, edge_feat_field=None): @@ -833,7 +881,9 @@ def eval_ntypes(self): return self._eval_ntypes class GSgnnNodeInferData(GSgnnNodeData): - """ Inference data for node tasks + r""" Inference data for node tasks + + GSgnnNodeInferData prepares the data for node prediction inference. Parameters ---------- @@ -851,6 +901,18 @@ class GSgnnNodeInferData(GSgnnNodeData): edge_feat_field : str or dict of list of str The field of the edge features. It's a dict if different edge types have different feature names. + + Examples + ---------- + .. code:: python + + from graphstorm.dataloading import GSgnnNodeInferData + from graphstorm.dataloading import GSgnnNodeDataLoader + + np_data = GSgnnNodeInferData(graph_name='dummy', part_config=part_config, + eval_ntypes=['n1'], label_field='label', + node_feat_field='feat') + np_dataloader = GSgnnNodeDataLoader(np_data, target_idx={'n1':[0]}, fanout=[15, 10], batch_size=128) """ def __init__(self, graph_name, part_config, eval_ntypes, label_field=None, node_feat_field=None, edge_feat_field=None): @@ -885,13 +947,14 @@ def prepare_data(self, g): if 'trainer_id' in g.nodes[ntype].data else None if 'test_mask' in g.nodes[ntype].data: # test_mask exists - # we will do evaluation. + # we will do evaluation or inference on test data. test_idx = dgl.distributed.node_split(g.nodes[ntype].data['test_mask'], pb, ntype=ntype, force_even=True, node_trainer_ids=node_trainer_ids) # If there are test data globally, we should add them to the dict.
if test_idx is not None and dist_sum(len(test_idx)) > 0: test_idxs[ntype] = test_idx + infer_idxs[ntype] = test_idx elif test_idx is None: logging.warning("%s does not contains test data, skip testing %s", ntype, ntype) diff --git a/python/graphstorm/eval/utils.py b/python/graphstorm/eval/utils.py index 60febc1689..847919171b 100644 --- a/python/graphstorm/eval/utils.py +++ b/python/graphstorm/eval/utils.py @@ -18,7 +18,7 @@ import torch as th from ..utils import get_backend, is_distributed -from ..data.utils import alltoallv_cpu +from ..data.utils import alltoallv_cpu, alltoallv_nccl def calc_distmult_pos_score(h_emb, t_emb, r_emb, device=None): """ Calculate DistMulti Score for positive pairs @@ -321,7 +321,7 @@ def broadcast_data(rank, world_size, data_tensor): if get_backend() == "gloo": alltoallv_cpu(rank, world_size, gather_list, data_tensors) else: #get_backend() == "nccl" - th.distributed.all_to_all(gather_list, data_tensors) + alltoallv_nccl(gather_list, data_tensors) data_tensor = th.cat(gather_list, dim=0) return data_tensor diff --git a/python/graphstorm/gconstruct/file_io.py b/python/graphstorm/gconstruct/file_io.py index 0deaa15ef9..bc90045de3 100644 --- a/python/graphstorm/gconstruct/file_io.py +++ b/python/graphstorm/gconstruct/file_io.py @@ -266,15 +266,44 @@ def read_data_hdf5(data_file, data_fields=None, in_mem=True): data[name] = f[name][:] if in_mem else HDF5Array(f[name], handle) return data +def stream_dist_tensors_to_hdf5(data, data_file, chunk_size=100000): + """ Stream write dict of dist tensor into a HDF5 file. + + Parameters + ---------- + data : dict of dist tensor + The data to be saved to the hdf5 file. + data_file : str + The file name of the hdf5 file. + chunk_size : int + The size of a chunk to extract from dist tensor. + """ + with h5py.File(data_file, "w") as f: + for key, val in data.items(): + arr = f.create_dataset(key, val.shape, dtype=np.array(val[0]).dtype) + if len(val) > chunk_size: + num_chunks = len(val) // chunk_size + remainder = len(val) % chunk_size + for i in range(num_chunks): + # extract a chunk from dist tensor + chunk_val = np.array(val[i*chunk_size:(i+1)*chunk_size]) + arr[i*chunk_size:(i+1)*chunk_size] = chunk_val + # need to write remainder + if remainder != 0: + remainder_val = np.array(val[num_chunks*chunk_size:len(val)]) + arr[num_chunks*chunk_size:] = remainder_val + else: + arr[:] = np.array(val[0:len(val)]) + def write_data_hdf5(data, data_file): """ Write data into a HDF5 file. Parameters ---------- data : dict - The data to be saved to the Parquet file. + The data to be saved to the hdf5 file. data_file : str - The file name of the Parquet file. + The file name of the hdf5 file. """ with h5py.File(data_file, "w") as f: for key, val in data.items(): diff --git a/python/graphstorm/gconstruct/transform.py b/python/graphstorm/gconstruct/transform.py index 3575942308..2361cafbd9 100644 --- a/python/graphstorm/gconstruct/transform.py +++ b/python/graphstorm/gconstruct/transform.py @@ -442,6 +442,7 @@ def call(self, feats): class NumericalMinMaxTransform(TwoPhaseFeatTransform): """ Numerical value with Min-Max normalization. + $val = (val-min) / (max-min)$ Parameters @@ -451,17 +452,29 @@ class NumericalMinMaxTransform(TwoPhaseFeatTransform): feat_name : str The feature name used in the constructed graph. max_bound : float - The maximum float value. + The maximum float value. Any number larger than max_bound will be set to max_bound. min_bound : float - The minimum float value + The minimum float value.
Any number smaller than min_bound will be set to min_bound. + max_val : list of float + Define the value of `max` in the Min-Max normalization formula for each feature. + If max_val is set, max_bound will be ignored. + min_val : list of float + Define the value of `min` in the Min-Max normalization formula for each feature. + If min_val is set, min_bound will be ignored. out_dtype: The dtype of the transformed feature. Default: None, we will not do data type casting. + transform_conf : dict + The configuration for the feature transformation. """ def __init__(self, col_name, feat_name, max_bound=sys.float_info.max, min_bound=-sys.float_info.max, - out_dtype=None): + max_val=None, min_val=None, + out_dtype=None, transform_conf=None): + self._max_val = np.array(max_val, dtype=np.float32) if max_val is not None else None + self._min_val = np.array(min_val, dtype=np.float32) if min_val is not None else None + self._conf = transform_conf self._max_bound = max_bound self._min_bound = min_bound out_dtype = np.float32 if out_dtype is None else out_dtype @@ -477,6 +490,12 @@ def pre_process(self, feats): """ assert isinstance(feats, (np.ndarray, ExtMemArrayWrapper)), \ "Feature of NumericalMinMaxTransform must be numpy array or ExtMemArray" + + # The max and min of $val = (val-min) / (max-min)$ is pre-defined + # in the transform_conf, return max_val and min_val directly + if self._max_val is not None and self._min_val is not None: + return {self.feat_name: (self._max_val, self._min_val)} + if isinstance(feats, ExtMemArrayWrapper): # TODO(xiangsx): This is not memory efficient. # It will load all data into main memory. @@ -493,15 +512,22 @@ def pre_process(self, feats): feats = feats.astype(np.float32) except: # pylint: disable=bare-except raise ValueError(f"The feature {self.feat_name} has to be integers or floats.") - assert len(feats.shape) <= 2, "Only support 1D fp feature or 2D fp feature" - max_val = np.amax(feats, axis=0) if len(feats.shape) == 2 \ - else np.array([np.amax(feats, axis=0)]) - min_val = np.amin(feats, axis=0) if len(feats.shape) == 2 \ - else np.array([np.amin(feats, axis=0)]) - max_val[max_val > self._max_bound] = self._max_bound - min_val[min_val < self._min_bound] = self._min_bound + if self._max_val is None: + max_val = np.amax(feats, axis=0) if len(feats.shape) == 2 \ + else np.array([np.amax(feats, axis=0)]) + max_val[max_val > self._max_bound] = self._max_bound + else: + max_val = self._max_val + + if self._min_val is None: + min_val = np.amin(feats, axis=0) if len(feats.shape) == 2 \ + else np.array([np.amin(feats, axis=0)]) + min_val[min_val < self._min_bound] = self._min_bound + else: + min_val = self._min_val + return {self.feat_name: (max_val, min_val)} def update_info(self, info): @@ -528,6 +554,11 @@ def update_info(self, info): self._max_val = max_val self._min_val = min_val + # We need to save the max_val and min_val in the config object. 
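+ # Keeping the computed max/min in the transformation config lets the + # exact same normalization be re-applied later, e.g. when the saved + # config is used to transform new data.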
+ if self._conf is not None: + self._conf['max_val'] = self._max_val.tolist() + self._conf['min_val'] = self._min_val.tolist() + def call(self, feats): """ Do normalization for feats @@ -878,11 +909,15 @@ def parse_feat_ops(confs): elif conf['name'] == 'max_min_norm': max_bound = conf['max_bound'] if 'max_bound' in conf else sys.float_info.max min_bound = conf['min_bound'] if 'min_bound' in conf else -sys.float_info.max + max_val = conf['max_val'] if 'max_val' in conf else None + min_val = conf['min_val'] if 'min_val' in conf else None transform = NumericalMinMaxTransform(feat['feature_col'], feat_name, max_bound, min_bound, - out_dtype=out_dtype) + max_val, + min_val, + out_dtype=out_dtype, transform_conf=conf) elif conf['name'] == 'rank_gauss': epsilon = conf['epsilon'] if 'epsilon' in conf else None transform = RankGaussTransform(feat['feature_col'], diff --git a/python/graphstorm/gsf.py b/python/graphstorm/gsf.py index 644bb380a9..f9d0acf8a3 100644 --- a/python/graphstorm/gsf.py +++ b/python/graphstorm/gsf.py @@ -81,16 +81,16 @@ class Options: # pylint: disable=missing-class-docstring Options.local_size) def initialize(ip_config, backend, use_wholegraph=False): - """ Initialize distributed inference context + """ Initialize distributed training and inference context. Parameters ---------- ip_config: str - File path of ip_config file + File path of ip_config file, e.g., `/tmp/ip_list.txt`. backend: str - Torch distributed backend + Torch distributed backend, e.g., ``gloo`` or ``nccl``. use_wholegraph: bool - Whether to use wholegraph for feature transfer + Whether to use wholegraph for feature transfer. """ # We need to use socket for communication in DGL 0.8. The tensorpipe backend has a bug. # This problem will be fixed in the future. @@ -116,6 +116,7 @@ def get_feat_size(g, node_feat_names): Returns ------- dict of int : the feature size for each node type. + """ feat_size = {} for ntype in g.ntypes: diff --git a/python/graphstorm/inference/ep_infer.py b/python/graphstorm/inference/ep_infer.py index 5804e9fcc2..771cdc7f4e 100644 --- a/python/graphstorm/inference/ep_infer.py +++ b/python/graphstorm/inference/ep_infer.py @@ -40,10 +40,11 @@ class GSgnnEdgePredictionInferrer(GSInferrer): """ def infer(self, loader, save_embed_path, save_prediction_path=None, - use_mini_batch_infer=False, # pylint: disable=unused-argument - node_id_mapping_file=None, - edge_id_mapping_file=None, - return_proba=True): + use_mini_batch_infer=False, # pylint: disable=unused-argument + node_id_mapping_file=None, + edge_id_mapping_file=None, + return_proba=True, + save_embed_format="pytorch"): """ Do inference The inference can do three things: @@ -67,6 +68,8 @@ def infer(self, loader, save_embed_path, save_prediction_path=None, graph partition algorithm. return_proba: bool Whether to return all the predictions or the maximum prediction. + save_embed_format : str + Specify the format of saved embeddings. """ do_eval = self.evaluator is not None if do_eval: assert loader.data.labels is not None, \ "A label field must be provided for edge classification " \ "or regression inference when evaluation is required." + if use_mini_batch_infer: + assert save_embed_path is None, \ + "Unable to save the node embeddings when using mini batch inference. " \ + "It is not guaranteed that mini-batch prediction will cover all the nodes."
sys_tracker.check('start inferencing') self._model.eval() @@ -120,10 +127,12 @@ def infer(self, loader, save_embed_path, save_prediction_path=None, # The order of the ntypes must be sorted embs = {ntype: embs[ntype] for ntype in sorted(target_ntypes)} save_gsgnn_embeddings(save_embed_path, embs, get_rank(), - get_world_size(), device=device, - node_id_mapping_file=node_id_mapping_file) - barrier() - sys_tracker.check('save embeddings') + get_world_size(), + device=device, + node_id_mapping_file=node_id_mapping_file, + save_embed_format=save_embed_format) + barrier() + sys_tracker.check('save embeddings') if save_prediction_path is not None: if edge_id_mapping_file is not None: diff --git a/python/graphstorm/inference/lp_infer.py b/python/graphstorm/inference/lp_infer.py index d82d485baf..c43bd0ab40 100644 --- a/python/graphstorm/inference/lp_infer.py +++ b/python/graphstorm/inference/lp_infer.py @@ -40,9 +40,10 @@ class GSgnnLinkPredictionInferrer(GSInferrer): # TODO(zhengda) We only support full-graph inference for now. def infer(self, data, loader, save_embed_path, - edge_mask_for_gnn_embeddings='train_mask', - use_mini_batch_infer=False, - node_id_mapping_file=None): + edge_mask_for_gnn_embeddings='train_mask', + use_mini_batch_infer=False, + node_id_mapping_file=None, + save_embed_format="pytorch"): """ Do inference The inference can do two things: @@ -67,6 +68,8 @@ def infer(self, data, loader, save_embed_path, node_id_mapping_file: str Path to the file storing node id mapping generated by the graph partition algorithm. + save_embed_format : str + Specify the format of saved embeddings. """ sys_tracker.check('start inferencing') self._model.eval() @@ -82,10 +85,12 @@ def infer(self, data, loader, save_embed_path, device = self.device if save_embed_path is not None: save_gsgnn_embeddings(save_embed_path, embs, get_rank(), - get_world_size(), device=device, - node_id_mapping_file=node_id_mapping_file) - barrier() - sys_tracker.check('save embeddings') + get_world_size(), + device=device, + node_id_mapping_file=node_id_mapping_file, + save_embed_format=save_embed_format) + barrier() + sys_tracker.check('save embeddings') if self.evaluator is not None: test_start = time.time() diff --git a/python/graphstorm/inference/np_infer.py b/python/graphstorm/inference/np_infer.py index 344d181225..0204b1a002 100644 --- a/python/graphstorm/inference/np_infer.py +++ b/python/graphstorm/inference/np_infer.py @@ -43,7 +43,8 @@ class GSgnnNodePredictionInferrer(GSInferrer): def infer(self, loader, save_embed_path, save_prediction_path=None, use_mini_batch_infer=False, node_id_mapping_file=None, - return_proba=True): + return_proba=True, + save_embed_format="pytorch"): """ Do inference The inference does three things: @@ -67,6 +68,8 @@ def infer(self, loader, save_embed_path, save_prediction_path=None, graph partition algorithm. return_proba: bool Whether to return all the predictions or the maximum prediction. + save_embed_format : str + Specify the format of saved embeddings. 
""" do_eval = self.evaluator is not None if do_eval: @@ -134,9 +137,11 @@ def infer(self, loader, save_embed_path, save_prediction_path=None, ntype_emb = embs[ntype] embeddings = {ntype: ntype_emb} - save_gsgnn_embeddings(save_embed_path, embeddings, - get_rank(), get_world_size(), device=device, - node_id_mapping_file=node_id_mapping_file) + save_gsgnn_embeddings(save_embed_path, embeddings, get_rank(), + get_world_size(), + device=device, + node_id_mapping_file=node_id_mapping_file, + save_embed_format=save_embed_format) barrier() sys_tracker.check('save embeddings') diff --git a/python/graphstorm/model/edge_gnn.py b/python/graphstorm/model/edge_gnn.py index ce0fea7cfd..d9da341547 100644 --- a/python/graphstorm/model/edge_gnn.py +++ b/python/graphstorm/model/edge_gnn.py @@ -89,8 +89,9 @@ def predict(self, blocks, target_edges, node_feats, edge_feats, is true otherwise return the maximum value. """ -class GSgnnEdgeModelBase(GSgnnModelBase, # pylint: disable=abstract-method - GSgnnEdgeModelInterface): +# pylint: disable=abstract-method +class GSgnnEdgeModelBase(GSgnnEdgeModelInterface, + GSgnnModelBase): """ The base class for edge-prediction GNN When a user wants to define an edge prediction GNN model and train the model @@ -216,7 +217,7 @@ def edge_mini_batch_gnn_predict(model, loader, return_proba=True, return_label=F for etype in target_edge_graph.canonical_etypes} edge_decoder_feats = data.get_edge_feats(input_edges, data.decoder_edge_feat, - target_edge_graph.device) + device) edge_decoder_feats = {etype: feat.to(th.float32) \ for etype, feat in edge_decoder_feats.items()} else: diff --git a/python/graphstorm/model/gnn.py b/python/graphstorm/model/gnn.py index fb5c354336..5f59f63c9b 100644 --- a/python/graphstorm/model/gnn.py +++ b/python/graphstorm/model/gnn.py @@ -160,7 +160,8 @@ def restore_dense_model(self, restore_model_path, To restore model parameters for a model with a node_input_encoder, a GNN layer and a decoder: - .. code:: + .. code:: python + # suppose we are going to load all layers. input_encoder = self.input_encoder gnn_model = self.gnn_model @@ -198,7 +199,8 @@ def restore_sparse_model(self, restore_model_path): -------- To load sparse model parameters for a node_input_encoder - .. code:: + .. code:: python + from graphstorm.model.utils import load_sparse_emb for ntype, sparse_emb in sparse_embeds.items(): @@ -222,7 +224,8 @@ def save_dense_model(self, model_path): Example: -------- - .. code:: + .. code:: python + # This function is only called by rank 0 input_encoder = self.input_encoder gnn_model = self.gnn_model @@ -259,18 +262,20 @@ def save_sparse_model(self, model_path): Step 1: Create a path to save the learnable node embeddings. - .. code:: - from graphstorm.model.util import create_sparse_emb_path + .. code:: python + from graphstorm.model.util import create_sparse_emb_path + for ntype, sparse_emb in sparse_embeds.items(): create_sparse_emb_path(model_path, ntype) # make sure rank 0 creates the folder and change permission first - Step 2: Save learnable node embeddings. - .. code:: - from graphstorm.model.utils import save_sparse_emb + .. code:: python + + from graphstorm.model.utils import save_sparse_emb + for ntype, sparse_emb in sparse_embeds.items(): save_sparse_emb(model_path, sparse_emb, ntype) @@ -291,9 +296,11 @@ def restore_model(self, restore_model_path, model_layer_to_load=None): -------- Load a model from "/tmp/checkpoints". - .. code:: + .. 
code:: python + # CustomGSgnnModel is a child class of GSgnnModelBase model = CustomGSgnnModel() + # Restore model parameters from "/tmp/checkpoints" model.restore_model("/tmp/checkpoints") @@ -308,7 +315,8 @@ def restore_model(self, restore_model_path, model_layer_to_load=None): start_load_t = time.time() # Restore the model weights from a checkpoint saved previously. if restore_model_path is not None: - logging.debug('load model from %s', restore_model_path) + if get_rank() == 0: + logging.debug('load model from %s', restore_model_path) self.restore_dense_model(restore_model_path, model_layer_to_load) # If a user doesn't specify the layer to load, @@ -316,7 +324,8 @@ def restore_model(self, restore_model_path, model_layer_to_load=None): if model_layer_to_load is None \ or GRAPHSTORM_MODEL_EMBED_LAYER in model_layer_to_load \ or GRAPHSTORM_MODEL_SPARSE_EMBED_LAYER in model_layer_to_load: - logging.debug('Load Sparse embedding from %s', restore_model_path) + if get_rank() == 0: + logging.debug('Load Sparse embedding from %s', restore_model_path) self.restore_sparse_model(restore_model_path) # We need to make sure that the sparse embedding is completely loaded @@ -336,9 +345,11 @@ def save_model(self, model_path): -------- Save a model into "/tmp/checkpoints". - .. code:: + .. code:: python + # CustomGSgnnModel is a child class of GSgnnModelBase model = CustomGSgnnModel() + # Model parameters will be saved into "/tmp/checkpoints" model.save_model("/tmp/checkpoints") @@ -377,12 +388,19 @@ def create_optimizer(self): optimizers. Example: + Case 1: if there is only one optimizer: + + .. code:: python + def create_optimizer(self): # define torch.optim.Optimizer return optimizer Case 2: if there are both dense and sparse optimizers: + + .. code:: python + def create_optimizer(self): dense = [dense_opt] # define torch.optim.Optimizer sparse = [sparse_opt] # define dgl sparse Optimizer diff --git a/python/graphstorm/model/lp_gnn.py b/python/graphstorm/model/lp_gnn.py index 60ea6dbfdd..5e29e6fa82 100644 --- a/python/graphstorm/model/lp_gnn.py +++ b/python/graphstorm/model/lp_gnn.py @@ -54,8 +54,9 @@ def forward(self, blocks, pos_graph, neg_graph, The loss of prediction. """ -class GSgnnLinkPredictionModelBase(GSgnnModelBase, # pylint: disable=abstract-method - GSgnnLinkPredictionModelInterface): +# pylint: disable=abstract-method +class GSgnnLinkPredictionModelBase(GSgnnLinkPredictionModelInterface, + GSgnnModelBase): """ The base class for link-prediction GNN When a user wants to define a link prediction GNN model and train the model diff --git a/python/graphstorm/model/node_gnn.py b/python/graphstorm/model/node_gnn.py index 56504e1186..3e749756f7 100644 --- a/python/graphstorm/model/node_gnn.py +++ b/python/graphstorm/model/node_gnn.py @@ -80,8 +80,9 @@ def predict(self, blocks, node_feats, edge_feats, input_nodes, return_proba): The GNN embeddings. 
""" -class GSgnnNodeModelBase(GSgnnModelBase, # pylint: disable=abstract-method - GSgnnNodeModelInterface): +# pylint: disable=abstract-method +class GSgnnNodeModelBase(GSgnnNodeModelInterface, + GSgnnModelBase): """ The base class for node-prediction GNN When a user wants to define a node prediction GNN model and train the model diff --git a/python/graphstorm/model/rgat_encoder.py b/python/graphstorm/model/rgat_encoder.py index a6c21880f5..118d372bee 100644 --- a/python/graphstorm/model/rgat_encoder.py +++ b/python/graphstorm/model/rgat_encoder.py @@ -176,6 +176,7 @@ class RelationalGATEncoder(GraphConvEncoder): r"""Relational graph attention encoder Parameters + ----------- g : DGLHeteroGraph Input graph. h_dim: int diff --git a/python/graphstorm/model/utils.py b/python/graphstorm/model/utils.py index ff2458d3ab..304eb4fa90 100644 --- a/python/graphstorm/model/utils.py +++ b/python/graphstorm/model/utils.py @@ -27,8 +27,29 @@ from torch.nn.parallel import DistributedDataParallel import dgl -from ..utils import get_rank, get_world_size, barrier -from ..data.utils import alltoallv_cpu +from ..gconstruct.file_io import stream_dist_tensors_to_hdf5 +from ..utils import get_rank, barrier, get_world_size +from ..data.utils import alltoallv_cpu, alltoallv_nccl + +def pad_file_index(file_index, width=5): + """ Left pad file_index with zerros. + + for examaple, given 1, it will return 00001. + + Parameters + ---------- + file_index: int + Index of the file + width: int + Minimum length of resulting string; strings with length less + than width be prepended with 0 characters. + + Return + ------ + str: padded file_index + """ + assert width > 1, "Width should be larger than 1" + return str(file_index).zfill(width) def sparse_emb_initializer(emb): """ Initialize sparse embedding @@ -178,7 +199,7 @@ def save_sparse_emb(model_path, sparse_emb, ntype): # the create_sparse_embeds_path() method first before calling save_sparse_embeds(). emb_path = os.path.join(model_path, ntype) os.makedirs(emb_path, exist_ok=True) - emb_file_path = os.path.join(emb_path, f'sparse_emb_{rank}.pt') + emb_file_path = os.path.join(emb_path, f'sparse_emb_{pad_file_index(rank)}.pt') th.save(embs, emb_file_path) def save_sparse_embeds(model_path, embed_layer): @@ -312,6 +333,9 @@ def _exchange_node_id_mapping(rank, world_size, device, node_id_mapping_file: str Path to the file storing node id mapping generated by the graph partition algorithm. + + Return: + Tensor: sub node_id_mappings corresponding to `rank` """ backend = th.distributed.get_backend() device = th.device('cpu') if backend == "gloo" else device @@ -340,12 +364,116 @@ def _exchange_node_id_mapping(rank, world_size, device, if backend == "gloo": alltoallv_cpu(rank, world_size, gather_list, data_tensors) else: # backend == "nccl" - th.distributed.all_to_all(gather_list, data_tensors) - return gather_list[0] + alltoallv_nccl(gather_list, data_tensors) + # move mapping into CPU + return gather_list[0].to(th.device("cpu")) -def save_embeddings(model_path, embeddings, rank, world_size, - device=th.device('cpu'), node_id_mapping_file=None): - """ Save embeddings in a distributed way +def distribute_nid_map(embeddings, rank, world_size, + node_id_mapping_file, device=th.device('cpu')): + """ Distribute nid_map to all workers. + + Parameters + ---------- + embeddings : DistTensor + Embeddings to save + rank : int + Local rank + world_size : int + World size in a distributed env. 
+ node_id_mapping_file: str + Path to the file storing node id mapping generated by the + graph partition algorithm. + device: torch device + Device used for all_to_allv data exchange. For gloo backend + we store data in CPU, For nccl backend, we need to store + data in GPU. + + Returns + ------- + Tensor or dict of Tensor: the node id mapping slice owned by `rank`. + """ + assert node_id_mapping_file is not None + if isinstance(embeddings, (dgl.distributed.DistTensor, LazyDistTensor)): + # only host 0 will load node id mapping from disk + if rank == 0: + ori_node_id_mapping = th.load(node_id_mapping_file) + _, node_id_mapping = th.sort(ori_node_id_mapping) + else: + node_id_mapping = None + + nid_mapping = _exchange_node_id_mapping( + rank, world_size, device, node_id_mapping, len(embeddings)) + elif isinstance(embeddings, dict): + nid_mapping = {} + # only host 0 will load node id mapping from disk + node_id_mappings = th.load(node_id_mapping_file) \ + if rank == 0 else None + + for name, emb in embeddings.items(): + if rank == 0: + assert name in node_id_mappings, \ + f"node id mapping for ntype {name} should exist" + # new mapping back index + ori_node_id_mapping = node_id_mappings[name] + _, node_id_mapping = th.sort(ori_node_id_mapping) + else: + node_id_mapping = None + + nid_mapping[name] = _exchange_node_id_mapping( + rank, world_size, device, node_id_mapping, len(emb)) + else: + nid_mapping = None + return nid_mapping + +def remap_embeddings(embeddings, rank, world_size, + node_id_mapping_file, device=th.device('cpu')): + """ Remap embeddings by nid_map without writing to disk. + + Parameters + ---------- + embeddings : DistTensor or dict of DistTensor + Embeddings to remap + rank : int + Local rank + world_size : int + World size in a distributed env. + node_id_mapping_file: str + Path to the file storing node id mapping generated by the + graph partition algorithm. + device: torch device + Device used for all_to_allv data exchange. For gloo backend + we store data in CPU, For nccl backend, we need to store + data in GPU. + + Returns + ------- + DistTensor : remapped DistTensor + """ + assert node_id_mapping_file is not None + + # TODO: handle when node_id_mapping_file is None. + nid_mapping = distribute_nid_map(embeddings, rank, world_size, + node_id_mapping_file, device) + + if isinstance(embeddings, (dgl.distributed.DistTensor, LazyDistTensor)): + start, end = _get_data_range(rank, world_size, len(embeddings)) + embeddings[list(range(start, end))] = embeddings[nid_mapping] + elif isinstance(embeddings, dict): + # We need to duplicate the dict so that the input argument is not changed. + embeddings = dict(embeddings.items()) + for name, emb in embeddings.items(): + if isinstance(emb, (dgl.distributed.DistTensor, LazyDistTensor)): + # this is the same window as nid_mapping + start, end = _get_data_range(rank, world_size, len(emb)) + # keep emb as a dist tensor; update it in place + emb[th.arange(start, end)] = emb[nid_mapping[name]] + th.distributed.barrier() + + return embeddings + +def save_pytorch_embeddings(model_path, embeddings, rank, world_size, + device=th.device('cpu'), node_id_mapping_file=None): + """ Save embeddings in a distributed way through PyTorch Parameters ---------- @@ -353,11 +481,10 @@ def save_embeddings(model_path, embeddings, rank, world_size, The path of the folder where the model is saved. embeddings : DistTensor Embeddings to save - rank: int + rank : int Rank of the current process in a distributed environment. world_size : int - World size in a distributed environment.
This tells the size of a distributed cluster - (How many processes in a cluster). + World size in a distributed env. device: torch device Device used for all_to_allv data exchange. For gloo backend we store data in CPU, For nccl backend, we need to store @@ -366,7 +493,6 @@ def save_embeddings(model_path, embeddings, rank, world_size, Path to the file storing node id mapping generated by the graph partition algorithm. """ - os.makedirs(model_path, exist_ok=True) # [04/16]: Only rank 0 can chmod to let all other ranks to write files. if rank == 0: # mode 767 means rwx-rw-rwx: @@ -383,36 +509,8 @@ def save_embeddings(model_path, embeddings, rank, world_size, # less than 10 billion. An ID mapping of 10 billion nodes # will take around 80 GByte. if node_id_mapping_file is not None: - if isinstance(embeddings, (dgl.distributed.DistTensor, LazyDistTensor)): - # only host 0 will load node id mapping from disk - if rank == 0: - ori_node_id_mapping = th.load(node_id_mapping_file) - _, node_id_mapping = th.sort(ori_node_id_mapping) - else: - node_id_mapping = None - - nid_mapping = _exchange_node_id_mapping( - rank, world_size, device, node_id_mapping, len(embeddings)) - elif isinstance(embeddings, dict): - nid_mapping = {} - # only host 0 will load node id mapping from disk - node_id_mappings = th.load(node_id_mapping_file) \ - if rank == 0 else None - - for name, emb in embeddings.items(): - if rank == 0: - assert name in node_id_mappings, \ - f"node id mapping for ntype {name} should exists" - # new mapping back index - ori_node_id_mapping = node_id_mappings[name] - _, node_id_mapping = th.sort(ori_node_id_mapping) - else: - node_id_mapping = None - - nid_mapping[name] = _exchange_node_id_mapping( - rank, world_size, device, node_id_mapping, len(emb)) - else: - nid_mapping = None + nid_mapping = distribute_nid_map(embeddings, rank, world_size, + node_id_mapping_file, device) else: nid_mapping = None @@ -442,16 +540,86 @@ def save_embeddings(model_path, embeddings, rank, world_size, if isinstance(embeddings, dict): # embedding per node type for name, emb in embeddings.items(): - th.save(emb, os.path.join(model_path, f'{name}_emb.part{rank}.bin')) + th.save(emb, os.path.join(model_path, f'{name}_emb.part{pad_file_index(rank)}.bin')) emb_info["emb_name"].append(name) else: - th.save(embeddings, os.path.join(model_path, f'emb.part{rank}.bin')) + th.save(embeddings, os.path.join(model_path, f'emb.part{pad_file_index(rank)}.bin')) emb_info["emb_name"] = None if rank == 0: with open(os.path.join(model_path, "emb_info.json"), 'w', encoding='utf-8') as f: f.write(json.dumps(emb_info)) +def save_hdf5_embeddings(model_path, embeddings, rank, world_size, + device=th.device('cpu'), node_id_mapping_file=None): + """ Save embeddings through hdf5 into a single file. + + Parameters + ---------- + model_path : str + The path of the folder where the model is saved. + embeddings : DistTensor + Embeddings to save + rank : int + Rank of the current process in a distributed environment. + world_size : int + World size in a distributed env. + device: torch device + Device used for all_to_allv data exchange. For gloo backend + we store data in CPU, For nccl backend, we need to store + data in GPU. + node_id_mapping_file: str + Path to the file storing node id mapping generated by the + graph partition algorithm. 
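+
+    Example
+    -------
+    A minimal sketch of reading the saved embeddings back. The file name
+    ``embed_dict.hdf5`` is fixed by this function; the node type ``"user"``
+    is only an illustration:
+
+    .. code:: python
+
+        import h5py
+
+        # each node type is stored as a dataset keyed by its name
+        with h5py.File("embed_dict.hdf5", "r") as f:
+            user_emb = f["user"][:]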
+ """ + mapped_embeds = remap_embeddings(embeddings, rank, world_size, + node_id_mapping_file, device) + if rank == 0: + stream_dist_tensors_to_hdf5(mapped_embeds, os.path.join(model_path, "embed_dict.hdf5")) + +def save_embeddings(model_path, embeddings, rank, world_size, + device=th.device('cpu'), node_id_mapping_file=None, + save_embed_format="pytorch"): + """ Save embeddings. + + Parameters + ---------- + model_path : str + The path of the folder where the model is saved. + embeddings : DistTensor + Embeddings to save + rank : int + Rank of the current process in a distributed environment. + world_size : int + World size in a distributed env. + device: torch device + Device used for all_to_allv data exchange. For gloo backend + we store data in CPU, For nccl backend, we need to store + data in GPU. + node_id_mapping_file : str + Path to the file storing node id mapping generated by the + graph partition algorithm. + save_embed_format : str + The format of saved embeddings. + Currently support ["pytorch", "hdf5"]. + """ + os.makedirs(model_path, exist_ok=True) + if save_embed_format == "pytorch": + if rank == 0: + logging.info("Writing GNN embeddings to "\ + "%s in pytorch format.", model_path) + save_pytorch_embeddings(model_path, embeddings, rank, world_size, + device, node_id_mapping_file) + elif save_embed_format == "hdf5": + if rank == 0: + logging.info("Writing GNN embeddings to "\ + "%s in hdf5 format.", \ + os.path.join(model_path, 'embed_dict.hdf5')) + save_hdf5_embeddings(model_path, embeddings, rank, world_size, + device, node_id_mapping_file) + else: + raise ValueError(f"{model_path} is not supported for save_embed_format") + def shuffle_predict(predictions, id_mapping_file, pred_type, rank, world_size, device): """ Shuffle prediction result according to id_mapping @@ -511,7 +679,7 @@ def save_prediction_results(predictions, prediction_path, rank): # make sure the prediction_path permission is changed before other process start to save barrier() - th.save(predictions, os.path.join(prediction_path, "predict-{}.pt".format(rank))) + th.save(predictions, os.path.join(prediction_path, f"predict-{pad_file_index(rank)}.pt")) def load_model(model_path, gnn_model=None, embed_layer=None, decoder=None): """ Load a complete gnn model. @@ -596,7 +764,8 @@ def load_sparse_emb(target_sparse_emb, ntype_emb_path): for i in range(math.ceil(num_files/world_size)): file_idx = i * world_size + rank if file_idx < num_files: - emb = th.load(os.path.join(ntype_emb_path, f'sparse_emb_{file_idx}.pt')) + emb = th.load(os.path.join(ntype_emb_path, + f'sparse_emb_{pad_file_index(file_idx)}.pt')) # Get the target idx range for sparse_emb_{rank}.pt start, end = _get_sparse_emb_range(num_embs, diff --git a/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py b/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py index f43492ed44..cf2f7b622a 100644 --- a/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_ep/ep_infer_gnn.py @@ -53,7 +53,8 @@ def main(config_args): label_field=config.label_field, decoder_edge_feat=config.decoder_edge_feat) model = gs.create_builtin_edge_gnn_model(infer_data.g, config, train_task=False) - model.restore_model(config.restore_model_path) + model.restore_model(config.restore_model_path, + model_layer_to_load=config.restore_model_layers) # TODO(zhengda) we should use a different way to get rank. 
infer = GSgnnEdgePredictionInferrer(model) infer.setup_device(device=device) @@ -80,16 +81,13 @@ def main(config_args): remove_target_edge_type=config.remove_target_edge_type, construct_feat_ntype=config.construct_feat_ntype, construct_feat_fanout=config.construct_feat_fanout) - # Preparing input layer for training or inference. - # The input layer can pre-compute node features in the preparing step if needed. - # For example pre-compute all BERT embeddings - model.prepare_input_encoder(infer_data) infer.infer(dataloader, save_embed_path=config.save_embed_path, save_prediction_path=config.save_prediction_path, use_mini_batch_infer=config.use_mini_batch_infer, node_id_mapping_file=config.node_id_mapping_file, edge_id_mapping_file=config.edge_id_mapping_file, - return_proba=config.return_proba) + return_proba=config.return_proba, + save_embed_format=config.save_embed_format) def generate_parser(): """ Generate an argument parser diff --git a/python/graphstorm/run/gsgnn_ep/ep_infer_lm.py b/python/graphstorm/run/gsgnn_ep/ep_infer_lm.py index f6b34442b0..7e43ff18cb 100644 --- a/python/graphstorm/run/gsgnn_ep/ep_infer_lm.py +++ b/python/graphstorm/run/gsgnn_ep/ep_infer_lm.py @@ -54,7 +54,8 @@ def main(config_args): label_field=config.label_field, decoder_edge_feat=config.decoder_edge_feat) model = gs.create_builtin_edge_model(infer_data.g, config, train_task=False) - model.restore_model(config.restore_model_path) + model.restore_model(config.restore_model_path, + model_layer_to_load=config.restore_model_layers) infer = GSgnnEdgePredictionInferrer(model) infer.setup_device(device=device) if not config.no_validation: @@ -76,7 +77,8 @@ def main(config_args): use_mini_batch_infer=config.use_mini_batch_infer, node_id_mapping_file=config.node_id_mapping_file, edge_id_mapping_file=config.edge_id_mapping_file, - return_proba=config.return_proba) + return_proba=config.return_proba, + save_embed_format=config.save_embed_format) def generate_parser(): """ Generate an argument parser diff --git a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py index c80cf66d23..e263011287 100644 --- a/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_lp/lp_infer_gnn.py @@ -43,7 +43,8 @@ def main(config_args): node_feat_field=config.node_feat_name, decoder_edge_feat=config.decoder_edge_feat) model = gs.create_builtin_lp_gnn_model(infer_data.g, config, train_task=False) - model.restore_model(config.restore_model_path) + model.restore_model(config.restore_model_path, + model_layer_to_load=config.restore_model_layers) infer = GSgnnLinkPredictionInferrer(model) infer.setup_device(device=device) if not config.no_validation: @@ -69,16 +70,13 @@ def main(config_args): batch_size=config.eval_batch_size, num_negative_edges=config.num_negative_edges_eval, fanout=config.eval_fanout) - # Preparing input layer for training or inference. - # The input layer can pre-compute node features in the preparing step if needed. - # For example pre-compute all BERT embeddings - model.prepare_input_encoder(infer_data) infer.infer(infer_data, dataloader, save_embed_path=config.save_embed_path, edge_mask_for_gnn_embeddings=None if config.no_validation else \ 'train_mask', # if no validation,any edge can be used in message passing. 
use_mini_batch_infer=config.use_mini_batch_infer, - node_id_mapping_file=config.node_id_mapping_file) + node_id_mapping_file=config.node_id_mapping_file, + save_embed_format=config.save_embed_format) def generate_parser(): """ Generate an argument parser diff --git a/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py b/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py index 4150a6a9af..e196d3fd83 100644 --- a/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py +++ b/python/graphstorm/run/gsgnn_lp/lp_infer_lm.py @@ -44,7 +44,8 @@ def main(config_args): node_feat_field=config.node_feat_name, decoder_edge_feat=config.decoder_edge_feat) model = gs.create_builtin_lp_model(infer_data.g, config, train_task=False) - model.restore_model(config.restore_model_path) + model.restore_model(config.restore_model_path, + model_layer_to_load=config.restore_model_layers) infer = GSgnnLinkPredictionInferrer(model) infer.setup_device(device=device) if not config.no_validation: @@ -77,7 +78,8 @@ def main(config_args): save_embed_path=config.save_embed_path, edge_mask_for_gnn_embeddings=None, # LM infer does not use GNN use_mini_batch_infer=config.use_mini_batch_infer, - node_id_mapping_file=config.node_id_mapping_file) + node_id_mapping_file=config.node_id_mapping_file, + save_embed_format=config.save_embed_format) def generate_parser(): """ Generate an argument parser diff --git a/python/graphstorm/run/gsgnn_np/np_infer_gnn.py b/python/graphstorm/run/gsgnn_np/np_infer_gnn.py index d8911ccfd7..e5cc4e629d 100644 --- a/python/graphstorm/run/gsgnn_np/np_infer_gnn.py +++ b/python/graphstorm/run/gsgnn_np/np_infer_gnn.py @@ -15,7 +15,6 @@ Inference script for node classification/regression tasks with GNN """ - import graphstorm as gs from graphstorm.config import get_argument_parser from graphstorm.config import GSConfig @@ -52,7 +51,8 @@ def main(config_args): node_feat_field=config.node_feat_name, label_field=config.label_field) model = gs.create_builtin_node_gnn_model(infer_data.g, config, train_task=False) - model.restore_model(config.restore_model_path) + model.restore_model(config.restore_model_path, + model_layer_to_load=config.restore_model_layers) infer = GSgnnNodePredictionInferrer(model) infer.setup_device(device=device) if not config.no_validation: @@ -76,15 +76,12 @@ def main(config_args): train_task=False, construct_feat_ntype=config.construct_feat_ntype, construct_feat_fanout=config.construct_feat_fanout) - # Preparing input layer for training or inference. - # The input layer can pre-compute node features in the preparing step if needed. 
- # For example pre-compute all BERT embeddings - model.prepare_input_encoder(infer_data) infer.infer(dataloader, save_embed_path=config.save_embed_path, save_prediction_path=config.save_prediction_path, use_mini_batch_infer=config.use_mini_batch_infer, node_id_mapping_file=config.node_id_mapping_file, - return_proba=config.return_proba) + return_proba=config.return_proba, + save_embed_format=config.save_embed_format) def generate_parser(): """ Generate an argument parser diff --git a/python/graphstorm/run/launch.py b/python/graphstorm/run/launch.py index ce089c6bc3..b23d19a263 100644 --- a/python/graphstorm/run/launch.py +++ b/python/graphstorm/run/launch.py @@ -591,9 +591,6 @@ def update_udf_command(udf_command, args): udf_command.append("--part-config") udf_command.append(args.part_config) - udf_command.append("--verbose") - udf_command.append(str(args.verbose)) - return udf_command def get_available_port(ip): @@ -755,9 +752,7 @@ def submit_jobs(args, udf_command): cmd, state_q, ip, args.ssh_port, username=args.ssh_username ) ) - - if args.verbose: - logging.debug(torch_dist_udf_command) + logging.debug(torch_dist_udf_command) # Start a cleanup process dedicated for cleaning up remote training jobs. conn1, conn2 = multiprocessing.Pipe() @@ -804,11 +799,12 @@ def get_argument_parser(): then the ssh command will be like: 'ssh bob@1.2.3.4 CMD' " "instead of 'ssh 1.2.3.4 CMD'", ) + # This argument is deprecated and kept only for backward compatibility. parser.add_argument( "--verbose", type=lambda x: (str(x).lower() in ['true', '1']), default=False, - help="Print more information.", + help="Print more information. This argument is deprecated and is no longer effective.", ) parser.add_argument( "--workspace", @@ -935,9 +931,8 @@ def check_input_arguments(args): args.num_omp_threads = max( cpu_cores_per_trainer, 1 ) - if args.verbose: - logging.debug("The number of OMP threads per trainer is set to %d", - args.num_omp_threads) + logging.debug("The number of OMP threads per trainer is set to %d", + args.num_omp_threads) else: assert args.num_omp_threads > 0, \ "The number of OMP threads per trainer should be larger than 0" @@ -949,9 +944,8 @@ def check_input_arguments(args): args.num_server_threads = max( cpu_cores_per_server, 1 ) - if args.verbose: - logging.debug("The number of OMP threads per server is set to %d", - args.num_server_threads) + logging.debug("The number of OMP threads per server is set to %d", + args.num_server_threads) else: assert args.num_server_threads > 0, \ "The number of OMP threads per server should be larger than 1" diff --git a/python/graphstorm/trainer/ep_trainer.py b/python/graphstorm/trainer/ep_trainer.py index adb1cf9d62..a7de98bac1 100644 --- a/python/graphstorm/trainer/ep_trainer.py +++ b/python/graphstorm/trainer/ep_trainer.py @@ -142,7 +142,7 @@ def fit(self, train_loader, num_epochs, for etype in batch_graph.canonical_etypes} edge_decoder_feats = data.get_edge_feats(input_edges, data.decoder_edge_feat, - batch_graph.device) + device) edge_decoder_feats = {etype: feat.to(th.float32) \ for etype, feat in edge_decoder_feats.items()} else: @@ -283,6 +283,19 @@ def eval(self, model, val_loader, test_loader, use_mini_batch_infer, total_steps """ test_start = time.time() sys_tracker.check('start prediction') + + metric = set(self.evaluator.metric) + need_proba = metric.intersection({'roc_auc', 'per_class_roc_auc', 'precision_recall'}) + need_label_pred = metric.intersection({'accuracy', 'f1_score', 'per_class_f1_score'}) + assert len(need_proba) == 0 or len(need_label_pred) == 0, \ + f"{need_proba} requires return_proba==True, " \
+ f"but {need_label_pred} requires return_proba==False." + if len(need_proba) > 0 and return_proba is False: + return_proba = True + logging.warning("%s requires return_proba==True. " + "Set return_proba to True.", need_proba) + + model.eval() if use_mini_batch_infer: val_pred, val_label = edge_mini_batch_gnn_predict(model, val_loader, return_proba, diff --git a/python/graphstorm/trainer/np_trainer.py b/python/graphstorm/trainer/np_trainer.py index 46323d634c..a7dccf8532 100644 --- a/python/graphstorm/trainer/np_trainer.py +++ b/python/graphstorm/trainer/np_trainer.py @@ -261,6 +261,18 @@ def eval(self, model, val_loader, test_loader, use_mini_batch_infer, total_steps """ teval = time.time() sys_tracker.check('before prediction') + + metric = set(self.evaluator.metric) + need_proba = metric.intersection({'roc_auc', 'per_class_roc_auc', 'precision_recall'}) + need_label_pred = metric.intersection({'accuracy', 'f1_score', 'per_class_f1_score'}) + assert len(need_proba) == 0 or len(need_label_pred) == 0, \ + f"{need_proba} requires return_proba==True, " \ + f"but {need_label_pred} requires return_proba==False." + if len(need_proba) > 0 and return_proba is False: + return_proba = True + logging.warning("%s requires return_proba==True. " + "Set return_proba to True.", need_proba) + if use_mini_batch_infer: val_pred, _, val_label = node_mini_batch_gnn_predict(model, val_loader, return_proba, return_label=True) diff --git a/python/graphstorm/utils.py b/python/graphstorm/utils.py index c5c13a0145..4d0f625480 100644 --- a/python/graphstorm/utils.py +++ b/python/graphstorm/utils.py @@ -30,7 +30,16 @@ TORCH_MAJOR_VER = int(th.__version__.split('.', maxsplit=1)[0]) def setup_device(local_rank): - """Setup computation device + r"""Set up the computation device. + + Parameters + ---------- + local_rank: int + Rank of the current process in a distributed environment. + + Returns + ------- + str: device where the model runs. """ if th.cuda.is_available(): assert local_rank < th.cuda.device_count(), \ diff --git a/tests/end2end-tests/data_process/compare_graphs.py b/tests/end2end-tests/data_process/compare_graphs.py index d44025ad4a..5d02a9d857 100644 --- a/tests/end2end-tests/data_process/compare_graphs.py +++ b/tests/end2end-tests/data_process/compare_graphs.py @@ -19,6 +19,8 @@ import dgl import numpy as np +from numpy.testing import assert_almost_equal + argparser = argparse.ArgumentParser("Compare graphs") argparser.add_argument("--graph-path1", type=str, required=True, help="The path of the constructed graph.") @@ -35,11 +37,14 @@ for name in g1.nodes[ntype].data: # We should skip '*_mask' because data split is split randomly. if 'mask' not in name: - assert np.all(g1.nodes[ntype].data[name].numpy() == g2.nodes[ntype].data[name].numpy()) + assert_almost_equal(g1.nodes[ntype].data[name].numpy(), + g2.nodes[ntype].data[name].numpy()) + for etype in g1.canonical_etypes: assert g1.number_of_edges(etype) == g2.number_of_edges(etype) for name in g1.edges[etype].data: # We should skip '*_mask' because data split is split randomly.
if 'mask' not in name: - assert np.all(g1.edges[etype].data[name].numpy() == g2.edges[etype].data[name].numpy()) + assert_almost_equal(g1.edges[etype].data[name].numpy(), + g2.edges[etype].data[name].numpy()) diff --git a/tests/end2end-tests/data_process/data_gen.py b/tests/end2end-tests/data_process/data_gen.py index 4c8525e639..abab89ea3d 100644 --- a/tests/end2end-tests/data_process/data_gen.py +++ b/tests/end2end-tests/data_process/data_gen.py @@ -41,6 +41,7 @@ def gen_rand_nid(max_nid, num_nodes): return gen_rand_nid(max_nid, num_nodes) return node_ids +np.random.seed(1) node_id1 = gen_rand_nid(1000000000, 10000) node_text = np.array([str(nid) for nid in node_id1]) node_data1 = { diff --git a/tests/end2end-tests/graphstorm-ec/mgpu_test.sh b/tests/end2end-tests/graphstorm-ec/mgpu_test.sh index e702620cff..71928c453a 100644 --- a/tests/end2end-tests/graphstorm-ec/mgpu_test.sh +++ b/tests/end2end-tests/graphstorm-ec/mgpu_test.sh @@ -173,3 +173,10 @@ echo "**************dataset: Generated multilabel MovieLens EC, RGCN layer: 1, n python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --exclude-training-targets True --multilabel true --num-classes 6 --node-feat-name movie:title user:feat --use-mini-batch-infer false --num-epochs 1 --backend nccl error_and_exit $? + +echo "**************dataset: Generated multilabel MovieLens EC, RGCN layer: 1, node feat: generated feature, inference: minibatch, exclude-training-targets: True, decoder edge feat: label" +python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_multi_label_ec/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --exclude-training-targets True --multilabel true --num-classes 6 --node-feat-name movie:title user:feat --use-mini-batch-infer true --topk-model-to-save 1 --save-embed-path /data/gsgnn_ec/emb/ --save-model-path /data/gsgnn_ec/ --save-model-frequency 1000 --decoder-edge-feat user,rating,movie:rate --fanout 'user/rating/movie:4@movie/rating-rev/user:5,user/rating/movie:2@movie/rating-rev/user:2' --num-layers 2 --decoder-type MLPEFeatEdgeDecoder + +error_and_exit $? +rm -fr /data/gsgnn_ec/* + diff --git a/tests/end2end-tests/graphstorm-ec/test.sh b/tests/end2end-tests/graphstorm-ec/test.sh index a915ea0515..2413940d16 100644 --- a/tests/end2end-tests/graphstorm-ec/test.sh +++ b/tests/end2end-tests/graphstorm-ec/test.sh @@ -79,8 +79,8 @@ python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_s error_and_exit $? 
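+
+# Note: with the metric check added to the trainers in this patch,
+# probability-based metrics (e.g. precision_recall, roc_auc) can no longer
+# be mixed with label-based metrics (e.g. accuracy, f1_score) in one run;
+# for example,
+#   --eval-metric precision_recall accuracy
+# would now fail the assertion, so the test below uses a single metric.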
# TODO(zhengda) Failure found during evaluation of the auc metric returning -1 multiclass format is not supported -echo "**************dataset: Test edge classification, RGCN layer: 1, node feat: fixed HF BERT, BERT nodes: movie, inference: mini-batch, eval_metric: precision_recall accuracy" -python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_ec_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --part-config /data/movielen_100k_ec_1p_4t/movie-lens-100k.json --eval-metric precision_recall accuracy --num-epochs 1 +echo "**************dataset: Test edge classification, RGCN layer: 1, node feat: fixed HF BERT, BERT nodes: movie, inference: mini-batch, eval_metric: precision_recall" +python3 -m graphstorm.run.gs_edge_classification --workspace $GS_HOME/training_scripts/gsgnn_ep/ --num-trainers $NUM_TRAINERS --num-servers 1 --num-samplers 0 --part-config /data/movielen_100k_ec_1p_4t/movie-lens-100k.json --ip-config ip_list.txt --ssh-port 2222 --cf ml_ec.yaml --part-config /data/movielen_100k_ec_1p_4t/movie-lens-100k.json --eval-metric precision_recall --num-epochs 1 error_and_exit $? diff --git a/tests/unit-tests/gconstruct/test_transform.py b/tests/unit-tests/gconstruct/test_transform.py index 9fb246b0f6..d11d7ca91e 100644 --- a/tests/unit-tests/gconstruct/test_transform.py +++ b/tests/unit-tests/gconstruct/test_transform.py @@ -112,7 +112,10 @@ def test_fp_transform(input_dtype): assert_equal(min_val[i], -5.) # Test collect info - transform = NumericalMinMaxTransform("test", "test") + transform_conf = { + "name": "max_min_norm" + } + transform = NumericalMinMaxTransform("test", "test", transform_conf=transform_conf) info = [(np.array([1.]), np.array([-1.])), (np.array([2.]), np.array([-0.5])), (np.array([0.5]), np.array([-0.1]))] @@ -121,6 +124,10 @@ def test_fp_transform(input_dtype): assert len(transform._min_val) == 1 assert_equal(transform._max_val[0], 2.) assert_equal(transform._min_val[0], -1.) + assert 'max_val' in transform_conf + assert 'min_val' in transform_conf + assert_equal(np.array(transform_conf['max_val']), 2.) + assert_equal(np.array(transform_conf['min_val']), -1.) info = [(np.array([1., 2., 3.]), np.array([-1., -2., 0.5])), (np.array([2., 1., 3.]), np.array([-0.5, -3., 0.1])), @@ -130,6 +137,84 @@ def test_fp_transform(input_dtype): assert len(transform._min_val) == 3 assert_equal(transform._max_val[0], 2.) assert_equal(transform._min_val[0], -1.) + assert 'max_val' in transform_conf + assert 'min_val' in transform_conf + assert_equal(np.array(transform_conf['max_val']), + np.array([2.,3.,3.])) + assert_equal(np.array(transform_conf['min_val']), + np.array([-1.,-3.,0.1])) + + transform_conf = { + "name": "max_min_norm", + "max_val": [1.,1.,1.], + "min_val": [-1.,-1.,-1.]
+ } + transform = NumericalMinMaxTransform("test", "test", + max_val=transform_conf['max_val'], + min_val=transform_conf['min_val'], + transform_conf=transform_conf) + feats = 2 * np.random.randn(10, 3).astype(input_dtype) + feats[0][0] = 2 + feats[0][1] = -2 + info = transform.pre_process(feats) + max_val = np.array(transform_conf['max_val']) + min_val = np.array(transform_conf["min_val"]) + assert_equal(info["test"][0], max_val) + assert_equal(info["test"][1], min_val) + transform.update_info([info["test"]]) + assert_equal(np.array(transform_conf['max_val']), + np.array([1.,1.,1.])) + assert_equal(np.array(transform_conf['min_val']), + np.array([-1.,-1.,-1.])) + result = transform(feats) + true_result = (feats - min_val) / (max_val - min_val) + true_result[true_result > 1] = 1 + true_result[true_result < 0] = 0 + assert_almost_equal(result["test"].astype(input_dtype), true_result) + + transform_conf = { + "name": "max_min_norm", + "min_val": [-1.,-1.,-1.] + } + transform = NumericalMinMaxTransform("test", "test", + min_val=transform_conf['min_val'], + transform_conf=transform_conf) + info = transform.pre_process(feats) + max_val = info["test"][0] + min_val = np.array(transform_conf['min_val']) + assert_equal(info["test"][0], max_val) + transform.update_info([info["test"]]) + assert_equal(np.array(transform_conf['max_val']), + max_val) + assert_equal(np.array(transform_conf['min_val']), + np.array([-1.,-1.,-1.])) + result = transform(feats) + true_result = (feats - min_val) / (max_val - min_val) + true_result[true_result > 1] = 1 + true_result[true_result < 0] = 0 + assert_almost_equal(result["test"].astype(input_dtype), true_result) + + transform_conf = { + "name": "max_min_norm", + "max_val": [1.,1.,1.] + } + transform = NumericalMinMaxTransform("test", "test", + max_val=transform_conf['max_val'], + transform_conf=transform_conf) + info = transform.pre_process(feats) + max_val = np.array(transform_conf['max_val']) + min_val = info["test"][1] + assert_equal(info["test"][0], max_val) + transform.update_info([info["test"]]) + assert_equal(np.array(transform_conf['max_val']), + np.array([1.,1.,1.])) + assert_equal(np.array(transform_conf['min_val']), + min_val) + result = transform(feats) + true_result = (feats - min_val) / (max_val - min_val) + true_result[true_result > 1] = 1 + true_result[true_result < 0] = 0 + assert_almost_equal(result["test"].astype(input_dtype), true_result) @pytest.mark.parametrize("input_dtype", [np.cfloat, np.float32]) @pytest.mark.parametrize("out_dtype", [None, np.float16]) diff --git a/tests/unit-tests/test_dataloading.py b/tests/unit-tests/test_dataloading.py index 5c31efe7c5..1f4d911c8d 100644 --- a/tests/unit-tests/test_dataloading.py +++ b/tests/unit-tests/test_dataloading.py @@ -178,8 +178,10 @@ def test_GSgnnEdgeData(): for etype in tr_etypes: assert th.all(tr_data1.test_idxs[etype] == get_nonzero(dist_graph.edges[etype[1]].data['test_mask'])) assert len(ev_data.test_idxs) == len(va_etypes) + assert len(ev_data.infer_idxs) == len(ev_data.test_idxs) for etype in va_etypes: assert th.all(ev_data.test_idxs[etype] == get_nonzero(dist_graph.edges[etype[1]].data['test_mask'])) + assert th.all(ev_data.infer_idxs[etype] == get_nonzero(dist_graph.edges[etype[1]].data['test_mask'])) # pass train etypes as None assert len(tr_data2.train_idxs) == len(dist_graph.canonical_etypes) @@ -201,8 +203,10 @@ def test_GSgnnEdgeData(): # pass eval etypes as None assert len(ev_data2.test_idxs) == 2 + assert len(ev_data2.infer_idxs) == 2 for etype in dist_graph.canonical_etypes: 
assert th.all(ev_data2.test_idxs[etype] == get_nonzero(dist_graph.edges[etype[1]].data['test_mask'])) + assert th.all(ev_data2.infer_idxs[etype] == get_nonzero(dist_graph.edges[etype[1]].data['test_mask'])) labels = tr_data.get_labels({('n0', 'r1', 'n1'): [0, 1]}) assert len(labels.keys()) == 1 @@ -273,8 +277,10 @@ def test_GSgnnNodeData(): for ntype in tr_ntypes: assert th.all(tr_data1.test_idxs[ntype] == get_nonzero(dist_graph.nodes[ntype].data['test_mask'])) assert len(ev_data.test_idxs) == len(va_ntypes) + assert len(ev_data.infer_idxs) == len(va_ntypes) for ntype in va_ntypes: assert th.all(ev_data.test_idxs[ntype] == get_nonzero(dist_graph.nodes[ntype].data['test_mask'])) + assert th.all(ev_data.infer_idxs[ntype] == get_nonzero(dist_graph.nodes[ntype].data['test_mask'])) labels = tr_data.get_labels({'n1': [0, 1]}) assert len(labels.keys()) == 1 diff --git a/tests/unit-tests/test_model_save_load.py b/tests/unit-tests/test_model_save_load.py index 543e180e11..7887c2c304 100644 --- a/tests/unit-tests/test_model_save_load.py +++ b/tests/unit-tests/test_model_save_load.py @@ -28,6 +28,7 @@ from graphstorm.model.utils import save_sparse_embeds from graphstorm.model.utils import load_sparse_embeds from graphstorm.model.utils import _get_sparse_emb_range +from graphstorm.model.utils import pad_file_index from graphstorm import get_feat_size from data_utils import generate_dummy_dist_graph @@ -100,7 +101,7 @@ def check_saved_sparse_emb(mock_get_world_size, mock_get_rank): for i in range(world_size): saved_embs.append(th.load( os.path.join(os.path.join(model_path, ntype), - f'sparse_emb_{i}.pt'))) + f'sparse_emb_{pad_file_index(i)}.pt'))) saved_embs = th.cat(saved_embs, dim=0) assert_equal(saved_embs.numpy(), sparse_embs[ntype].numpy()) check_saved_sparse_emb() diff --git a/tests/unit-tests/test_utils.py b/tests/unit-tests/test_utils.py index f3ad3ec5c8..d8ad6f65be 100644 --- a/tests/unit-tests/test_utils.py +++ b/tests/unit-tests/test_utils.py @@ -17,14 +17,18 @@ import tempfile import pytest import multiprocessing as mp +import h5py import torch as th import numpy as np +import dgl from numpy.testing import assert_equal +from dgl.distributed import DistTensor from graphstorm.model.utils import save_embeddings, LazyDistTensor, remove_saved_models, TopKList from graphstorm.model.utils import _get_data_range -from graphstorm.model.utils import _exchange_node_id_mapping +from graphstorm.model.utils import _exchange_node_id_mapping, distribute_nid_map from graphstorm.model.utils import shuffle_predict +from graphstorm.model.utils import pad_file_index from graphstorm.gconstruct.utils import save_maps from graphstorm import get_feat_size @@ -32,6 +36,8 @@ from graphstorm.eval.utils import gen_mrr_score from graphstorm.utils import setup_device +from graphstorm.gconstruct.file_io import stream_dist_tensors_to_hdf5 + def gen_embedding_with_nid_mapping(num_embs): emb = th.rand((num_embs, 12)) ori_nid_mapping = th.randperm(num_embs) @@ -123,7 +129,7 @@ def run_dist_exchange_node_id_mapping(worker_rank, world_size, backend, assert_equal(target_nid_mapping.numpy(), nid_mapping.cpu().numpy()) @pytest.mark.parametrize("num_embs", [100, 101]) -@pytest.mark.parametrize("backend", ["gloo"]) +@pytest.mark.parametrize("backend", ["gloo", "nccl"]) def test_exchange_node_id_mapping(num_embs, backend): node_id_mapping = th.randperm(num_embs) start, end = _get_data_range(0, 4, num_embs) @@ -157,6 +163,108 @@ def test_exchange_node_id_mapping(num_embs, backend): assert p2.exitcode == 0 assert p3.exitcode == 0 +def 
run_distribute_nid_map(embeddings, local_rank, world_size, + node_id_mapping_file, backend, target_nid_mapping): + dist_init_method = 'tcp://{master_ip}:{master_port}'.format( + master_ip='127.0.0.1', master_port='12345') + th.distributed.init_process_group(backend=backend, + init_method=dist_init_method, + world_size=world_size, + rank=local_rank) + device = setup_device(local_rank) + nid_mapping = distribute_nid_map(embeddings, local_rank, world_size, + node_id_mapping_file, device) + + if isinstance(embeddings, (dgl.distributed.DistTensor, LazyDistTensor)): + assert_equal(target_nid_mapping[local_rank].numpy(), nid_mapping.cpu().numpy()) + elif isinstance(embeddings, dict): + for name in embeddings.keys(): + assert_equal(target_nid_mapping[name][local_rank].numpy(), \ + nid_mapping[name].cpu().numpy()) + +@pytest.mark.parametrize("backend", ["gloo", "nccl"]) +def test_distribute_nid_map(backend): + # need to force a reset of the fork context + # because dist tensors are the input for multiple processes + with tempfile.TemporaryDirectory() as tmpdirname: + # get the test dummy distributed graph + g, _ = generate_dummy_dist_graph(tmpdirname, size="tiny") + dummy_dist_embeds = {} + ori_nid_maps = {} + target_nid_maps = {} + for ntype in g.ntypes: + dummy_dist_embeds[ntype] = DistTensor((g.number_of_nodes(ntype), 5), + dtype=th.float32, name=f'ntype-{ntype}', + part_policy=g.get_node_partition_policy(ntype)) + ori_nid_maps[ntype] = th.randperm(g.number_of_nodes(ntype)) + + target_nid_maps[ntype] = [] + _, sorted_nid_map = th.sort(ori_nid_maps[ntype]) + for i in range(4): + start, end = _get_data_range(i, 4, g.number_of_nodes(ntype)) + target_nid_maps[ntype].append(sorted_nid_map[start:end].clone()) + + nid_map_dict_path = os.path.join(tmpdirname, "nid_map_dict.pt") + nid_map_tensor_path = os.path.join(tmpdirname, "nid_map_tensor.pt") + th.save(ori_nid_maps, nid_map_dict_path) + dummy_ntype = g.ntypes[0] + th.save(ori_nid_maps[dummy_ntype], nid_map_tensor_path) + + # when dummy_dist_embeds is a dict + ctx = mp.get_context('spawn') + p0 = ctx.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds, 0, 4, nid_map_dict_path, backend, \ + target_nid_maps)) + p1 = ctx.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds, 1, 4, nid_map_dict_path, backend, \ + target_nid_maps)) + p2 = ctx.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds, 2, 4, nid_map_dict_path, backend, \ + target_nid_maps)) + p3 = ctx.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds, 3, 4, nid_map_dict_path, backend, \ + target_nid_maps)) + p0.start() + p1.start() + p2.start() + p3.start() + p0.join() + p1.join() + p2.join() + p3.join() + assert p0.exitcode == 0 + assert p1.exitcode == 0 + assert p2.exitcode == 0 + assert p3.exitcode == 0 + + # when dummy_dist_embeds is a dist tensor + ctx2 = mp.get_context('spawn') + p4 = ctx2.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds[dummy_ntype], 0, 4, nid_map_tensor_path, \ + backend, target_nid_maps[dummy_ntype])) + p5 = ctx2.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds[dummy_ntype], 1, 4, nid_map_tensor_path, \ + backend, target_nid_maps[dummy_ntype])) + p6 = ctx2.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds[dummy_ntype], 2, 4, nid_map_tensor_path, \ + backend, target_nid_maps[dummy_ntype])) + p7 = ctx2.Process(target=run_distribute_nid_map, + args=(dummy_dist_embeds[dummy_ntype], 3, 4, nid_map_tensor_path, \ + backend, target_nid_maps[dummy_ntype])) + + p4.start() + p5.start() + 
p6.start() + p7.start() + p4.join() + p5.join() + p6.join() + p7.join() + assert p4.exitcode == 0 + assert p5.exitcode == 0 + assert p6.exitcode == 0 + assert p7.exitcode == 0 + def run_dist_save_embeddings(model_path, emb, worker_rank, world_size, node_id_mapping_file, backend): dist_init_method = 'tcp://{master_ip}:{master_port}'.format( @@ -266,7 +374,7 @@ def test_shuffle_predict(num_embs, backend): # TODO: Only test gloo now # Will add test for nccl once we enable nccl @pytest.mark.parametrize("num_embs", [16, 17]) -@pytest.mark.parametrize("backend", ["gloo"]) +@pytest.mark.parametrize("backend", ["gloo", "nccl"]) def test_save_embeddings_with_id_mapping(num_embs, backend): import tempfile @@ -289,8 +397,8 @@ def test_save_embeddings_with_id_mapping(num_embs, backend): assert p1.exitcode == 0 # Load saved embeddings - emb0 = th.load(os.path.join(tmpdirname, 'emb.part0.bin'), weights_only=True) - emb1 = th.load(os.path.join(tmpdirname, 'emb.part1.bin'), weights_only=True) + emb0 = th.load(os.path.join(tmpdirname, f'emb.part{pad_file_index(0)}.bin'), weights_only=True) + emb1 = th.load(os.path.join(tmpdirname, f'emb.part{pad_file_index(1)}.bin'), weights_only=True) saved_emb = th.cat([emb0, emb1], dim=0) assert len(saved_emb) == len(emb) assert_equal(emb[nid_mapping].numpy(), saved_emb.numpy()) @@ -329,20 +437,20 @@ def test_save_embeddings_with_id_mapping(num_embs, backend): assert p1.exitcode == 0 # Load saved embeddings - emb0 = th.load(os.path.join(tmpdirname, 'n0_emb.part0.bin'), weights_only=True) - emb1 = th.load(os.path.join(tmpdirname, 'n0_emb.part1.bin'), weights_only=True) + emb0 = th.load(os.path.join(tmpdirname, f'n0_emb.part{pad_file_index(0)}.bin'), weights_only=True) + emb1 = th.load(os.path.join(tmpdirname, f'n0_emb.part{pad_file_index(1)}.bin'), weights_only=True) saved_emb = th.cat([emb0, emb1], dim=0) assert len(saved_emb) == len(embs['n0']) assert_equal(embs['n0'][nid_mappings['n0']].numpy(), saved_emb.numpy()) - emb0 = th.load(os.path.join(tmpdirname, 'n1_emb.part0.bin'), weights_only=True) - emb1 = th.load(os.path.join(tmpdirname, 'n1_emb.part1.bin'), weights_only=True) + emb0 = th.load(os.path.join(tmpdirname, f'n1_emb.part{pad_file_index(0)}.bin'), weights_only=True) + emb1 = th.load(os.path.join(tmpdirname, f'n1_emb.part{pad_file_index(1)}.bin'), weights_only=True) saved_emb = th.cat([emb0, emb1], dim=0) assert len(saved_emb) == len(embs['n1']) assert_equal(embs['n1'][nid_mappings['n1']].numpy(), saved_emb.numpy()) - emb0 = th.load(os.path.join(tmpdirname, 'n2_emb.part0.bin'), weights_only=True) - emb1 = th.load(os.path.join(tmpdirname, 'n2_emb.part1.bin'), weights_only=True) + emb0 = th.load(os.path.join(tmpdirname, f'n2_emb.part{pad_file_index(0)}.bin'), weights_only=True) + emb1 = th.load(os.path.join(tmpdirname, f'n2_emb.part{pad_file_index(1)}.bin'), weights_only=True) saved_emb = th.cat([emb0, emb1], dim=0) assert len(saved_emb) == len(embs['n2']) assert_equal(embs['n2'][nid_mappings['n2']].numpy(), saved_emb.numpy()) @@ -359,11 +467,11 @@ def test_save_embeddings(): type0_random_emb, type1_random_emb = helper_save_embedding(tmpdirname) # Only work with torch 1.13+ - feats_type0 = [th.load(os.path.join(tmpdirname, "type0_emb.part{}.bin".format(i)), + feats_type0 = [th.load(os.path.join(tmpdirname, f"type0_emb.part{pad_file_index(i)}.bin"), weights_only=True) for i in range(4)] feats_type0 = th.cat(feats_type0, dim=0) # Only work with torch 1.13+ - feats_type1 = [th.load(os.path.join(tmpdirname, "type1_emb.part{}.bin".format(i)), + feats_type1 = 
[th.load(os.path.join(tmpdirname, f"type1_emb.part{pad_file_index(i)}.bin"), weights_only=True) for i in range(4)] feats_type1 = th.cat(feats_type1, dim=0) @@ -463,7 +571,44 @@ def test_gen_mrr_score(): assert th.isclose(metrics['mrr'], metrics_opti['mrr']) # Default tolerance: 1e-08 +def test_stream_dist_tensors_to_hdf5(): + with tempfile.TemporaryDirectory() as tmpdirname: + # get the test dummy distributed graph + # medium size has 1,000,000 nodes, which is enough (>chunk_size) + g, _ = generate_dummy_dist_graph(tmpdirname, size="medium") + + dummy_dist_embeds = {} + for ntype in g.ntypes: + dummy_dist_embeds[ntype] = DistTensor((g.number_of_nodes(ntype), 5), + dtype=th.float32, name=f'ntype-{ntype}', + part_policy=g.get_node_partition_policy(ntype)) + + # chunk size needs to be smaller than num of nodes + chunk_size = g.number_of_nodes(g.ntypes[0]) // 4 + stream_dist_tensors_to_hdf5(dummy_dist_embeds, os.path.join(tmpdirname, "embed_dict.hdf5"), \ + chunk_size=chunk_size) + + read_f = h5py.File(os.path.join(tmpdirname, "embed_dict.hdf5"), "r") + for ntype in g.ntypes: + assert g.number_of_nodes(ntype) == len(read_f[ntype]) + assert_equal(dummy_dist_embeds[ntype][0:len(dummy_dist_embeds[ntype])].numpy(), \ + read_f[ntype][0:]) + +def test_pad_file_index(): + assert pad_file_index(1) == "00001" + assert pad_file_index(111) == "00111" + assert pad_file_index(111, 4) == "0111" + fail = False + try: + pad_file_index(111, 0) + except: + fail = True + assert fail + if __name__ == '__main__': + test_distribute_nid_map(backend='gloo') + test_distribute_nid_map(backend='nccl') + test_shuffle_predict(num_embs=16, backend='gloo') test_shuffle_predict(num_embs=17, backend='nccl') @@ -478,3 +623,5 @@ def test_gen_mrr_score(): test_remove_saved_models() test_topklist() test_gen_mrr_score() + + test_stream_dist_tensors_to_hdf5()