diff --git a/graphstorm-processing/docs/Makefile b/graphstorm-processing/docs/Makefile new file mode 100644 index 0000000000..d0c3cbf102 --- /dev/null +++ b/graphstorm-processing/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/graphstorm-processing/docs/make.bat b/graphstorm-processing/docs/make.bat new file mode 100644 index 0000000000..6247f7e231 --- /dev/null +++ b/graphstorm-processing/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/graphstorm-processing/docs/source/conf.py b/graphstorm-processing/docs/source/conf.py new file mode 100644 index 0000000000..7334ba97ae --- /dev/null +++ b/graphstorm-processing/docs/source/conf.py @@ -0,0 +1,53 @@ +# pylint: skip-file +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'graphstorm-processing' +copyright = '2023, AGML Team' +author = 'AGML Team, Amazon' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
diff --git a/graphstorm-processing/docs/source/developer/developer-guide.rst b/graphstorm-processing/docs/source/developer/developer-guide.rst
new file mode 100644
index 0000000000..1a7faf85db
--- /dev/null
+++ b/graphstorm-processing/docs/source/developer/developer-guide.rst
@@ -0,0 +1,230 @@
+Developer Guide
+---------------
+
+The project is set up using ``poetry`` to make it easier for developers to
+jump into the project.
+
+The steps we recommend are:
+
+Install JDK 8, 11
+~~~~~~~~~~~~~~~~~
+
+PySpark requires a compatible Java installation to run, so
+you will need to ensure your active JDK is using either
+Java 8 or 11.
+
+On MacOS you can do this using ``brew``:
+
+.. code-block:: bash
+
+    brew install openjdk@11
+
+On Linux it will depend on your distribution's package
+manager. For Ubuntu you can use:
+
+.. code-block:: bash
+
+    sudo apt install openjdk-11-jdk
+
+On Amazon Linux 2 you can use:
+
+.. code-block:: bash
+
+    sudo yum install java-11-amazon-corretto-headless
+    sudo yum install java-11-amazon-corretto-devel
+
+Install ``pyenv``
+~~~~~~~~~~~~~~~~~~
+
+``pyenv`` is a tool to manage multiple Python version installations. It
+can be installed through the installer below on a Linux machine:
+
+.. code-block:: bash
+
+    curl -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer | bash
+
+or use ``brew`` on a Mac:
+
+.. code-block:: bash
+
+    brew update
+    brew install pyenv
+
+For more info on ``pyenv`` see `its documentation `_.
+
+Create a Python 3.9 env and activate it.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We use Python 3.9 in our Docker images, so a local Python 3.9 environment
+most closely resembles the execution environment that will be used for
+distributed training.
+
+.. code-block:: bash
+
+    pyenv install 3.9
+    pyenv global 3.9
+
+..
+
+    Note: We recommend not mixing up ``conda`` and ``pyenv``. When developing for
+    this project, simply ``conda deactivate`` until there's no ``conda``
+    env active (even ``base``) and just rely on ``pyenv`` and ``poetry`` to handle
+    dependencies.
+
+Install ``poetry``
+~~~~~~~~~~~~~~~~~~~
+
+``poetry`` is a dependency and build management system for Python. To install it
+use:
+
+.. code-block:: bash
+
+    curl -sSL https://install.python-poetry.org | python3 -
+
+Install dependencies through ``poetry``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Now we are ready to install our dependencies through ``poetry``.
+
+We have split the project dependencies into the “main” dependencies that
+``poetry`` installs by default, and the ``dev`` dependency group that
+installs the dependencies that are only needed to develop the library.
+
+**On a POSIX system** (tested on Ubuntu, CentOS, MacOS) run:
+
+.. code-block:: bash
+
+    # Install all dependencies into local .venv
+    poetry install --with dev
+
+Once all dependencies are installed you should be able to run the unit
+tests for the project and continue with development using:
+
+.. code-block:: bash
+
+    poetry run pytest ./graphstorm-processing/tests
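+
+If you only want to exercise part of the test suite while iterating, the usual
+pytest selection flags can be passed through ``poetry run`` (the selection
+expression below is just an illustration):
+
+.. code-block:: bash
+
+    # Run only the tests whose names match the expression, stopping at the first failure
+    poetry run pytest ./graphstorm-processing/tests -k "transformation" -x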
+
+You can also activate and use the virtual environment using:
+
+.. code-block:: bash
+
+    poetry shell
+    # We're now using the graphstorm-processing-py3.9 env so we can just run
+    pytest ./graphstorm-processing/tests
+
+To learn more about ``poetry`` see its `documentation `_.
+
+Use ``black`` to format code [optional]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We use `black `_ to
+format code in this project. ``black`` is an opinionated formatter that
+helps speed up development and code reviews. It is included in our
+``dev`` dependencies so it will be installed along with the other dev
+dependencies.
+
+To use ``black`` in the project you can run (from the project's root,
+same level as ``pyproject.toml``):
+
+.. code-block:: bash
+
+    # From the project's root directory, graphstorm-processing, run:
+    black .
+
+To get a preview of the changes ``black`` would make you can use:
+
+.. code-block:: bash
+
+    black . --diff --color
+
+You can add ``black`` auto-formatting to VSCode using the `Black
+Formatter `__ extension.
+
+
+Use mypy and pylint to lint code
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We include the ``mypy`` and ``pylint`` linters as dependencies under the ``dev`` group
+of dependencies. These linters perform static checks on your code and
+can be used in a complementary manner.
+
+We recommend `using VSCode and enabling the mypy linter `_
+to get in-editor annotations.
+
+You can also lint the project code through:
+
+.. code-block:: bash
+
+    poetry run mypy ./graphstorm_processing
+
+To learn more about ``mypy`` and how it can help development
+`see its documentation `_.
+
+
+Our goal is to minimize ``mypy`` errors as much as possible for the
+project. New code should be linted and not introduce additional mypy
+errors. When necessary it's OK to use ``type: ignore`` to silence
+``mypy`` errors inline, but this should be used sparingly.
+
+As a project, GraphStorm requires a 10/10 pylint score, so
+ensure your code conforms to the expectation by running
+
+.. code-block:: bash
+
+    pylint --rcfile=/path/to/graphstorm/tests/lint/pylintrc
+
+on your code before commits. To make this easier we include
+a pre-commit hook below.
+
+Use a pre-commit hook to ensure ``black`` and ``pylint`` run before commits
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To make code formatting and ``pylint`` checks easier for graphstorm-processing
+developers, we recommend using a pre-commit hook.
+
+We include ``pre-commit`` in the project's ``dev`` dependencies, so once
+you have activated the project's venv (``poetry shell``) you can just
+create a file named ``.pre-commit-config.yaml`` with the following contents:
+
+.. code-block:: yaml
+
+    # .pre-commit-config.yaml
+    repos:
+      - repo: https://github.com/psf/black
+        rev: 23.7.0
+        hooks:
+          - id: black
+            language_version: python3.9
+            files: 'graphstorm_processing\/.*\.pyi?$|tests\/.*\.pyi?$|scripts\/.*\.pyi?$'
+            exclude: 'python\/.*\.pyi'
+      - repo: local
+        hooks:
+          - id: pylint
+            name: pylint
+            entry: pylint
+            language: system
+            types: [python]
+            args:
+              [
+                "--rcfile=./tests/lint/pylintrc"
+              ]
+
+
+And then run:
+
+.. code-block:: bash
+
+    pre-commit install
+
+which will install the ``black`` and ``pylint`` hooks into your local repository and
+ensure they run before every commit.
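+
+You can also run the hooks on demand against the whole codebase, without making
+a commit, which is a good way to check that everything is set up correctly:
+
+.. code-block:: bash
+
+    # Run every configured hook against all files in the repository
+    pre-commit run --all-files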
+
+.. note::
+
+    The pre-commit hook will also apply to all commits you make to the root
+    GraphStorm repository. Since GraphStorm itself doesn't use ``black``, you might
+    want to remove the hooks. You can do so from the root repo
+    using ``rm -rf .git/hooks``.
+
+    Both projects use ``pylint`` to check Python files, so we'd still recommend using
+    that hook even if you're doing development for both GSProcessing and GraphStorm.
diff --git a/graphstorm-processing/docs/source/developer/input-configuration.rst b/graphstorm-processing/docs/source/developer/input-configuration.rst
new file mode 100644
index 0000000000..e6e2d7ae98
--- /dev/null
+++ b/graphstorm-processing/docs/source/developer/input-configuration.rst
@@ -0,0 +1,430 @@
+.. _input-configuration:
+
+GraphStorm Processing Input Configuration
+==========================================
+
+GraphStorm Processing uses a JSON configuration file to
+parse and process the data into the format needed
+by GraphStorm partitioning and training downstream.
+
+We use this configuration format as an intermediate layer
+between other config formats, such as the one used
+by the single-machine GConstruct module.
+
+GSProcessing can take a GConstruct-formatted file
+directly, and we also provide `a script `_
+that can convert a `GConstruct `_
+input configuration file into the ``GSProcessing`` format,
+although this is mostly aimed at developers; users can
+rely on the automatic conversion.
+
+The GSProcessing input data configuration has two top-level objects:
+
+.. code-block:: json
+
+    {
+        "version": "gsprocessing-v1.0",
+        "graph": {}
+    }
+
+- ``version`` (String, required): The version of the configuration file being used. We include
+  the package name to allow self-contained identification of the file format.
+- ``graph`` (JSON object, required): one configuration object that defines each
+  of the node types and edge types that describe the graph.
+
+We describe the ``graph`` object next.
+
+``graph`` configuration object
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``graph`` configuration object can have two top-level objects:
+
+.. code-block:: json
+
+    {
+        "edges": [{}],
+        "nodes": [{}]
+    }
+
+- ``edges``: (array of JSON objects, required). Each JSON object
+  in this array describes one edge type and determines how the edge
+  structure will be parsed.
+- ``nodes``: (array of JSON objects, optional). Each JSON object
+  in this array describes one node type. This key is optional; if
+  it is missing, node IDs are derived from the ``edges`` objects.
+
+--------------
+
+Contents of an ``edges`` configuration object
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+An ``edges`` configuration object can contain the following top-level
+objects:
+
+.. code-block:: json
+
+    {
+        "data": {
+            "format": "String",
+            "files": ["String"],
+            "separator": "String"
+        },
+        "source": {"column": "String", "type": "String"},
+        "relation": {"type": "String"},
+        "destination": {"column": "String", "type": "String"},
+        "labels": [
+            {
+                "column": "String",
+                "type": "String",
+                "split_rate": {
+                    "train": "Float",
+                    "val": "Float",
+                    "test": "Float"
+                }
+            }
+        ],
+        "features": [{}]
+    }
+
+- ``data`` (JSON Object, required): Describes the physical files
+  that store the data described in this object. The JSON object has the
+  following top-level keys:
+
+  - ``format`` (String, required): indicates the format the data is
+    stored in. We accept either ``"csv"`` or ``"parquet"`` as valid
+    file formats.
+
+  - ``files`` (array of String, required): the physical location of files.
+    The format accepts two options:
+
+    - a single-element list with a directory-like (ending in ``/``)
+      **relative** path under which all the files that correspond to
+      the current edge type are stored.
+
+      - e.g. ``"files": ['path/to/edge/type/']``
+      - This option allows for concise listing of entire types and
+        is the preferred option. All the files under the path will be loaded.
+
+    - a multi-element list of **relative** file paths.
+
+      - ``"files": ['path/to/edge/type/file_1.csv', 'path/to/edge/type/file_2.csv']``
+      - This option allows for multiple types to be stored under the
+        same input prefix, but will result in more verbose spec
+        files.
+
+    - Since the spec expects **relative paths**, the caller is
+      responsible for providing a path prefix to the execution
+      engine. The prefix will determine if the source is a local
+      filesystem or S3, allowing the spec to be portable, i.e. a user
+      can move the physical files and the spec will still be valid,
+      as long as the relative structure is kept.
+
+  - ``separator`` (String, optional): Only relevant for CSV files,
+    determines the separator used between each column in the files.
+
+- ``source``: (JSON object, required): Describes the source nodes
+  for the edge type. The top-level keys for the object are:
+
+  - ``column``: (String, required) The name of the column in the
+    physical data files.
+  - ``type``: (String, optional) The type name of the nodes. If not
+    provided, we assume that the column name is the type name.
+
+- ``destination``: (JSON object, required): Describes the
+  destination nodes for the edge type. Its format is the same as the
+  ``source`` key, with a JSON object that contains
+  ``{"column": String, "type": String}``.
+- ``relation``: (JSON object, required): Describes the relation
+  modeled by the edges. A relation can be common among all edges, or it
+  can have sub-types. The top-level keys for the object are:
+
+  - ``type`` (String, required): The type of the relation described by
+    the edges. For example, for a source type ``user``, destination
+    ``movie`` we can have a relation type ``interacted_with`` for an
+    edge type ``user:interacted_with:movie``.
+
+- ``labels`` (List of JSON objects, optional): Describes the label
+  for the current edge type. The label object has the following
+  top-level keys:
+
+  - ``column`` (String, required): The column that contains the values
+    for the label. Should be the empty string, ``""``, if the ``type``
+    key has the value ``"link_prediction"``.
+  - ``type`` (String, required): The type of the learning task. Can
+    take the following String values:
+
+    - ``"classification"``: An edge classification task. The values
+      in the specified ``column`` are treated as categorical
+      variables.
+    - ``"regression"``: An edge regression task. The values in the
+      specified ``column`` are treated as numerical values.
+    - ``"link_prediction"``: A link prediction task. The ``column``
+      should be ``""`` in this case.
+
+  - ``separator``: (String, optional): For multi-label classification
+    tasks, this separator is used within the column to list multiple
+    classification labels in one entry.
+  - ``split_rate`` (JSON object, optional): Defines a split rate
+    for the label items. The sum of the values for ``train``, ``val`` and
+    ``test`` needs to be 1.0.
+
+    - ``train``: The percentage of the data with available labels to
+      assign to the train set (0.0, 1.0].
+    - ``val``: The percentage of the data with available labels to
+      assign to the validation set [0.0, 1.0).
+    - ``test``: The percentage of the data with available labels to
+      assign to the test set [0.0, 1.0).
+
+- ``features`` (List of JSON objects, optional): Describes
+  the set of features for the current edge type. See the :ref:`features-object`
+  section for details.
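+
+Putting these keys together, a complete ``edges`` entry for a hypothetical
+``user:rated:movie`` edge type with an edge regression label could look like
+the following (the file path and column names are purely illustrative):
+
+.. code-block:: json
+
+    {
+        "data": {
+            "format": "csv",
+            "separator": ",",
+            "files": ["edges/user-rated-movie/"]
+        },
+        "source": {"column": "user_id", "type": "user"},
+        "relation": {"type": "rated"},
+        "destination": {"column": "movie_id", "type": "movie"},
+        "labels": [
+            {
+                "column": "rating",
+                "type": "regression",
+                "split_rate": {
+                    "train": 0.8,
+                    "val": 0.1,
+                    "test": 0.1
+                }
+            }
+        ]
+    }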
+
+--------------
+
+Contents of a ``nodes`` configuration object
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A node configuration object in a ``nodes`` field can contain the
+following top-level keys:
+
+.. code-block:: json
+
+    {
+        "data": {
+            "format": "String",
+            "files": ["String"],
+            "separator": "String"
+        },
+        "column": "String",
+        "type": "String",
+        "labels": [
+            {
+                "column": "String",
+                "type": "String",
+                "separator": "String",
+                "split_rate": {
+                    "train": "Float",
+                    "val": "Float",
+                    "test": "Float"
+                }
+            }
+        ],
+        "features": [{}]
+    }
+
+- ``data``: (JSON object, required): Has the same definition as for
+  the edges object, with one top-level key for the ``format`` that
+  takes a String value, and one for the ``files`` that takes an array
+  of String values.
+- ``column``: (String, required): The name of the column in the data
+  that stores the node ids.
+- ``type``: (String, optional): A type name for the nodes described
+  in this object. If not provided, the ``column`` value is used as the
+  node type.
+- ``labels``: (List of JSON objects, optional): Similar to the
+  labels object defined for edges, but the values that the ``type`` can
+  take are different.
+
+  - ``column`` (String, required): The name of the column that
+    contains the label values.
+  - ``type`` (String, required): Specifies the target task type, which
+    can be:
+
+    - ``"classification"``: A node classification task. The values in the specified
+      ``column`` are treated as categorical variables.
+    - ``"regression"``: A node regression task. The values in the specified
+      ``column`` are treated as float values.
+
+  - ``separator`` (String, optional): For multi-label
+    classification tasks, this separator is used within the column to
+    list multiple classification labels in one entry.
+
+    - e.g. with separator ``|`` we can have ``action|comedy`` as a
+      label value.
+
+  - ``split_rate`` (JSON object, optional): Defines a split rate
+    for the label items. The sum of the values for ``train``, ``val`` and
+    ``test`` needs to be 1.0.
+
+    - ``train``: The percentage of the data with available labels to
+      assign to the train set (0.0, 1.0].
+    - ``val``: The percentage of the data with available labels to
+      assign to the validation set [0.0, 1.0).
+    - ``test``: The percentage of the data with available labels to
+      assign to the test set [0.0, 1.0).
+
+- ``features`` (List of JSON objects, optional): Describes
+  the set of features for the current node type. See the next section, :ref:`features-object`,
+  for details.
+
+--------------
+
+.. _features-object:
+
+Contents of a ``features`` configuration object
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+An element of a ``features`` configuration object (for edges or nodes)
+can contain the following top-level keys:
+
+.. code-block:: json
+
+    {
+        "column": "String",
+        "name": "String",
+        "transformation": {
+            "name": "String",
+            "kwargs": {
+                "arg_name": ""
+            }
+        },
+        "data": {
+            "format": "String",
+            "files": ["String"],
+            "separator": "String"
+        }
+    }
+
+- ``column`` (String, required): The column that contains the raw
+  feature values in the dataset.
+- ``transformation`` (JSON object, optional): The type of
+  transformation that will be applied to the feature. For details on
+  the individual transformations supported see :ref:`supported-transformations`.
+  If this key is missing, the feature is treated as
+  a **no-op** feature without ``kwargs``.
+
+  - ``name`` (String, required): The name of the transformation to be
+    applied.
+  - ``kwargs`` (JSON object, optional): A dictionary of parameter
+    names and values. Each individual transformation will have its own
+    supported parameters, described in :ref:`supported-transformations`.
+
+- ``name`` (String, optional): The name that will be given to the
+  encoded feature. If not given, **column** is used as the output name.
+- ``data`` (JSON object, optional): If the data for the feature
+  exist in a file source that's different from the rest of the data of
+  the node/edge type, they are provided here. For example, you could
+  store each feature in its own file source:
+
+  .. code-block:: python
+
+      # Example node config with multiple features
+      {
+          # This is where the node structure data exist; we just need an id column
+          "data": {
+              "format": "parquet",
+              "files": ["path/to/node_ids"]
+          },
+          "column": "node_id",
+          "type": "my_node_type",
+          "features": [
+              # Feature 1
+              {
+                  "column": "feature_one",
+                  # The files contain one "node_id" col and one "feature_one" col
+                  "data": {
+                      "format": "parquet",
+                      "files": ["path/to/feature_one/"]
+                  }
+              },
+              # Feature 2
+              {
+                  "column": "feature_two",
+                  # The files contain one "node_id" col and one "feature_two" col
+                  "data": {
+                      "format": "parquet",
+                      "files": ["path/to/feature_two/"]
+                  }
+              }
+          ]
+      }
+
+  **The file source needs
+  to contain the column names of the parent node/edge type to allow a
+  1-1 mapping between the structure and feature files.**
+
+  For nodes, the feature files need to have one column named after the node id
+  column name (the value of ``"column"`` for the parent node type); for edges
+  we need both the ``source`` and ``destination`` columns to use as a
+  composite key.
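+
+As a further illustration, here is a single ``features`` entry that encodes a
+vector feature stored in a delimited CSV column, using the ``no-op``
+transformation described in the next section (the column, output name, and
+file path are made up for this example):
+
+.. code-block:: json
+
+    {
+        "column": "embedding",
+        "name": "paper_embedding",
+        "transformation": {
+            "name": "no-op",
+            "kwargs": {
+                "separator": "|"
+            }
+        },
+        "data": {
+            "format": "csv",
+            "separator": ",",
+            "files": ["node_features/paper-embeddings.csv"]
+        }
+    }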
+
+.. _supported-transformations:
+
+Supported transformations
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In this section we'll describe the transformations we support.
+The name of the transformation is the value that would appear
+in the ``transformation['name']`` element of the feature configuration,
+with the attached ``kwargs`` for the transformations that support
+arguments.
+
+- ``no-op``
+
+  - Passes along the data as-is to be written to storage and
+    used in the partitioning pipeline. The data are assumed to be single
+    values or vectors of floats.
+  - ``kwargs``:
+
+    - ``separator`` (String, optional): Only relevant for CSV file
+      sources, when a separator is used to encode vector feature
+      values into one column. If given, the separator will be used to
+      split the values in the column and create a vector column
+      output. Example: for a separator ``'|'`` the CSV value
+      ``1|2|3`` would be transformed to a vector, ``[1, 2, 3]``.
+
+--------------
+
+Examples
+~~~~~~~~
+
+OAG-Paper dataset
+-----------------
+
+..
code-block:: json + + { + "version" : "gsprocessing-v1.0", + "graph" : { + "edges" : [ + { + "data": { + "format": "csv", + "files": [ + "edges.csv" + ], + "separator": "," + }, + "source": {"column": "~from", "type": "paper"}, + "dest": {"column": "~to", "type": "paper"}, + "relation": {"type": "cites"} + } + ], + "nodes" : [ + { + "data": { + "format": "csv", + "separator": ",", + "files": [ + "node_feat.csv" + ] + }, + "type": "paper", + "column": "ID", + "labels": [ + { + "column": "field", + "type": "classification", + "separator": ";", + "split_rate": { + "train": 0.7, + "val": 0.1, + "test": 0.2 + } + } + ] + } + ] + } + } diff --git a/graphstorm-processing/docs/source/index.rst b/graphstorm-processing/docs/source/index.rst new file mode 100644 index 0000000000..cc027cbb08 --- /dev/null +++ b/graphstorm-processing/docs/source/index.rst @@ -0,0 +1,154 @@ +.. graphstorm-processing documentation master file, created by + sphinx-quickstart on Tue Aug 1 02:04:45 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to GraphStorm Distributed Data Processing documentation! +================================================= + +.. toctree:: + :maxdepth: 1 + :caption: Contents: + + Example + Distributed processing setup + Running on Amazon Sagemaker + Developer Guide + Input configuration + + +GraphStorm Distributed Data Processing allows you to process and prepare massive graph data +for training with GraphStorm. GraphStorm Processing takes care of generating +unique ids for nodes, using them to encode edge structure files, process +individual features and prepare the data to be passed into the +distributed partitioning and training pipeline of GraphStorm. + +We use PySpark to achieve +horizontal parallelism, allowing us to scale to graphs with billions of nodes +and edges. + +.. _installation-ref: + +Installation +------------ + +The project uses Python 3.9. We recommend using `PyEnv `_ +to have isolated Python installations. + +With PyEnv installed you can create and activate a Python 3.9 environment using + +.. code-block:: bash + + pyenv install 3.9 + pyenv local 3.9 + + +With a recent version of ``pip`` installed (we recommend ``pip>=21.3``), you can simply run ``pip install .`` +from the root directory of the project (``graphstorm/graphstorm-processing``), +which should install the library into your environment and pull in all dependencies. + +Install Java 8, 11, or 17 +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Spark has a runtime dependency on the JVM to run, so you'll need to ensure +Java is installed and available on your system. + +On MacOS you can install Java using ``brew``: + +.. code-block:: bash + + brew install openjdk@11 + +On Linux it will depend on your distribution's package +manager. For Ubuntu you can use: + +.. code-block:: bash + + sudo apt install openjdk-11-jdk + +On Amazon Linux 2 you can use: + +.. code-block:: bash + + sudo yum install java-11-amazon-corretto-headless + sudo yum install java-11-amazon-corretto-devel + +To check if Java is installed you can use. + +.. code-block:: bash + + java -version + + +Example +------- + +See the provided :doc:`usage/example` for an example of how to start with tabular +data and convert them into a graph representation before partitioning and +training with GraphStorm. + +Usage +----- + +To use the library to process your data, you will need to have your data +in a tabular format, and a corresponding JSON configuration file that describes the +data. 
The input data can be in CSV (with header(s)) or Parquet format. + +The configuration file can be in GraphStorm's GConstruct format, +with the caveat that the file paths need to be relative to the +location of the config file. See :doc:`/usage/example` for more details. + +After installing the library, executing a processing job locally can be done using: + +.. code-block:: bash + + gs-processing \ + --config-filename gconstruct-config.json \ + --input-prefix /path/to/input/data \ + --output-prefix /path/to/output/data + +Once the processing engine has processed the data, we want to ensure +they match the requirements of the DGL distributed partitioning +pipeline, so we need to run an additional script that will +make sure the produced data matches the assumptions of DGL [#f1]_. + +.. note:: + + Ensure you pass the output path of the previous step as the input path here. + +.. code-block:: bash + + gs-repartition \ + --input-prefix /path/to/output/data + +Once this script completes, the data are ready to be fed into DGL's distributed +partitioning pipeline. +See `this guide `_ +for more details on how to use GraphStorm distributed partitioning on SageMaker. + +See :doc:`/usage/example` for a detailed walkthrough of using GSProcessing to +wrangle data into a format that's ready to be consumed by the GraphStorm/DGL +partitioning pipeline. + + +Using with Amazon SageMaker +--------------------------- + +To run distributed jobs on Amazon SageMaker we will have to build a Docker image +and push it to the Amazon Elastic Container Registry, which we cover in +:doc:`usage/distributed-processing-setup` and run a SageMaker Processing +job which we describe in :doc:`/usage/amazon-sagemaker`. + + +Developer guide +--------------- + +To get started with developing the package refer to :doc:`/developer/developer-guide`. + + +.. rubric:: Footnotes + +.. [#f1] DGL expects that every file produced for a single node/edge type + has matching row counts, which is something that Spark cannot guarantee. + We use the re-partitioning script to fix this where needed in the produced + output. \ No newline at end of file diff --git a/graphstorm-processing/docs/source/usage/amazon-sagemaker.rst b/graphstorm-processing/docs/source/usage/amazon-sagemaker.rst new file mode 100644 index 0000000000..53fe61c922 --- /dev/null +++ b/graphstorm-processing/docs/source/usage/amazon-sagemaker.rst @@ -0,0 +1,154 @@ +Running distributed jobs on Amazon SageMaker +============================================ + +Once the :doc:`distributed processing setup ` is complete, we can +use the Amazon SageMaker launch scripts to launch distributed processing +jobs that use AWS resources. + +To demonstrate the usage of GSProcessing on Amazon SageMaker, we will execute the same job we used in our local +execution example, but this time use Amazon SageMaker to provide the compute resources instead of our +local machine. + +Upload data to S3 +----------------- + +Amazon SageMaker uses S3 as its storage target, so before starting +we'll need to upload our test data to S3. To do so you will need +to have read/write access to an S3 bucket, and the requisite AWS credentials +and permissions. + +We will use the AWS CLI to upload data so make sure it is +`installed `_ +and `configured `_ +in you local environment. + +Assuming ``graphstorm/graphstorm-processing`` is our current working +directory we can upload the test data to S3 using: + +.. code-block:: bash + + MY_BUCKET="enter-your-bucket-name-here" + REGION="bucket-region" # e.g. 
us-west-2
+    aws --region ${REGION} s3 sync ./tests/resources/small_heterogeneous_graph/ \
+        "s3://${MY_BUCKET}/gsprocessing-input"
+
+.. note::
+
+    Make sure you are uploading your data to a bucket
+    that was created in the same region as the ECR image
+    you pushed in :doc:`/usage/distributed-processing-setup`.
+
+
+Launch the GSProcessing job on Amazon SageMaker
+-----------------------------------------------
+
+Once the data are uploaded to S3, we can use the Python script
+``graphstorm-processing/scripts/run_distributed_processing.py``
+to run a GSProcessing job on Amazon SageMaker.
+
+For this example we'll use a SageMaker Spark cluster with 2 ``ml.t3.xlarge`` instances
+since this is a tiny dataset. Using SageMaker you'll be able to create clusters
+of up to 20 instances, allowing you to scale your processing to massive graphs,
+using larger instances like ``ml.r5.24xlarge``.
+
+Since we're now executing on AWS, we'll need access to an execution role
+for SageMaker and the ECR image URI we created in :doc:`/usage/distributed-processing-setup`.
+For instructions on how to create an execution role for SageMaker
+see the `AWS SageMaker documentation `_.
+
+Let's set up a small bash script that will run the parametrized processing
+job, followed by the re-partitioning job, both on SageMaker:
+
+.. code-block:: bash
+
+    ACCOUNT="enter-your-account-id-here" # e.g. 1234567890
+    MY_BUCKET="enter-your-bucket-name-here"
+    SAGEMAKER_ROLE_NAME="enter-your-sagemaker-execution-role-name-here"
+    REGION="bucket-region" # e.g. us-west-2
+    DATASET_S3_PATH="s3://${MY_BUCKET}/gsprocessing-input"
+    OUTPUT_BUCKET=${MY_BUCKET}
+    DATASET_NAME="small-graph"
+    CONFIG_FILE="gconstruct-config.json"
+    INSTANCE_COUNT="2"
+    INSTANCE_TYPE="ml.t3.xlarge"
+    NUM_FILES="4"
+
+    IMAGE_URI="${ACCOUNT}.dkr.ecr.${REGION}.amazonaws.com/graphstorm-processing:0.1.0"
+    ROLE="arn:aws:iam::${ACCOUNT}:role/service-role/${SAGEMAKER_ROLE_NAME}"
+
+    OUTPUT_PREFIX="s3://${OUTPUT_BUCKET}/gsprocessing/${DATASET_NAME}/${INSTANCE_COUNT}x-${INSTANCE_TYPE}-${NUM_FILES}files/"
+
+    # Conditionally delete data at output
+    echo "Delete all data under output path? ${OUTPUT_PREFIX}"
+    select yn in "Yes" "No"; do
+        case $yn in
+            Yes ) aws s3 rm --recursive ${OUTPUT_PREFIX} --quiet; break;;
+            No ) break;;
+        esac
+    done
+
+    # This will run and block until the GSProcessing job is done
+    python scripts/run_distributed_processing.py \
+        --s3-input-prefix ${DATASET_S3_PATH} \
+        --s3-output-prefix ${OUTPUT_PREFIX} \
+        --role ${ROLE} \
+        --image ${IMAGE_URI} \
+        --region ${REGION} \
+        --config-filename ${CONFIG_FILE} \
+        --instance-count ${INSTANCE_COUNT} \
+        --instance-type ${INSTANCE_TYPE} \
+        --job-name "${DATASET_NAME}-${INSTANCE_COUNT}x-${INSTANCE_TYPE//./-}-${NUM_FILES}files" \
+        --num-output-files ${NUM_FILES} \
+        --wait-for-job
+
+    # This will run the follow-up re-partitioning job
+    python scripts/run_repartitioning.py --s3-input-prefix ${OUTPUT_PREFIX} \
+        --role ${ROLE} --image ${IMAGE_URI} --config-filename "metadata.json" \
+        --instance-type ${INSTANCE_TYPE} --wait-for-job
+
+
+.. note::
+
+    The re-partitioning job runs on a single instance, so for large graphs you will
+    want to scale up to an instance with more memory to avoid memory errors. ``ml.r5`` instances
+    should allow you to re-partition graph data with billions of nodes and edges.
+
+The ``--num-output-files`` parameter
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can see that we provided a parameter named
+``--num-output-files`` to ``run_distributed_processing.py``.
This is an +important parameter, as it provides a hint to set the parallelism for Spark. + +It can safely be skipped and let Spark decide the proper value based on the cluster's +instance type and count. If setting it yourself a good value to use is +``num_instances * num_cores_per_instance * 2``, which will ensure good +utilization of the cluster resources. + + +Examine the output +------------------ + +Once both jobs are finished we can examine the output created, which +should match the output we saw when running the same jobs locally +in :doc:`/usage/example`: + + +.. code-block:: bash + + $ aws s3 ls ${OUTPUT_PREFIX} + + PRE edges/ + PRE node_data/ + PRE node_id_mappings/ + 2023-08-05 00:47:36 804 launch_arguments.json + 2023-08-05 00:47:36 11914 metadata.json + 2023-08-05 00:47:37 545 perf_counters.json + 2023-08-05 00:47:37 12082 updated_row_counts_metadata.json + +Run distributed partitioning and training on Amazon SageMaker +------------------------------------------------------------- + +With the data now processed you can follow the +`GraphStorm Amazon SageMaker guide `_ +to partition your data and run training on AWS. diff --git a/graphstorm-processing/docs/source/usage/distributed-processing-setup.rst b/graphstorm-processing/docs/source/usage/distributed-processing-setup.rst new file mode 100644 index 0000000000..785dd5a514 --- /dev/null +++ b/graphstorm-processing/docs/source/usage/distributed-processing-setup.rst @@ -0,0 +1,136 @@ +Distributed Processing setup for Amazon SageMaker +================================================= + +In this guide we'll demonstrate how to prepare your environment to run +GraphStorm Processing (GSP) jobs on Amazon SageMaker. + +We're assuming a Linux host environment used throughout +this tutorial, but other OS should work fine as well. + +The steps required are: + +- Clone the GraphStorm repository. +- Install Docker. +- Install Poetry. +- Set up AWS access. +- Build the GraphStorm Processing image using Docker. +- Push the image to the Amazon Elastic Container Registry (ECR). +- Launch a SageMaker Processing job using the example scripts. + +Clone the GraphStorm repository +------------------------------- + +You can clone the GraphStorm repository using + +.. code-block:: bash + + git clone https://github.com/awslabs/graphstorm.git + +You can then navigate to the ``graphstorm-processing/docker`` directory +that contains the relevant code: + +.. code-block:: bash + + cd ./graphstorm/graphstorm-processing/docker + +Install Docker +-------------- + +To get started with building the GraphStorm Processing image +you'll need to have the Docker engine installed. + + +To install Docker follow the instructions at the +`official site `_. + +Install Poetry +-------------- + +We use `Poetry `_ as our build +tool and for dependency management, +so we need to install it to facilitate building the library. + +You can install Poetry using: + +.. code-block:: bash + + curl -sSL https://install.python-poetry.org | python3 - + +For detailed installation instructions the +`Poetry docs `_. + + +Set up AWS access +----------------- + +To build and push the image to ECR we'll make use of the +``aws-cli`` and we'll need valid AWS credentials as well. + +To install the AWS CLI you can use: + +.. code-block:: bash + + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" + unzip awscliv2.zip + sudo ./aws/install + +To set up credentials for use with ``aws-cli`` see the +`AWS docs `_. 
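+
+A quick way to confirm that the CLI is configured correctly is to ask it which
+identity it is authenticated as (both commands below are standard ``aws-cli``
+commands):
+
+.. code-block:: bash
+
+    # Set up a default profile interactively (access key, secret key, region)
+    aws configure
+
+    # Print the account ID and the user/role ARN the CLI is authenticated as
+    aws sts get-caller-identity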
+ +Your role should have full ECR access to be able to pull from ECR to build the image, +create an ECR repository if it doesn't exist, and push the GSProcessing image to the repository. + +Building the GraphStorm Processing image using Docker +----------------------------------------------------- + +Once Docker and Poetry are installed, and your AWS credentials are set up, +we can use the provided scripts +in the ``graphstorm-processing/docker`` directory to build the image. + +The ``build_gsprocessing_image.sh`` script can build the image +locally and tag it. For example, assuming our current directory is where +we cloned ``graphstorm/graphstorm-processing``: + +.. code-block:: bash + + bash docker/build_gsprocessing_image.sh + +The above will use the Dockerfile of the latest available GSProcessing version, +build an image and tag it as ``graphstorm-processing:${VERSION}`` where +``${VERSION}`` will take be the latest available GSProcessing version (e.g. ``0.1.0``). + +The script also supports other arguments to customize the image name, +tag and other aspects of the build. See ``bash docker/build_gsprocessing_image.sh --help`` +for more information. + +Push the image to the Amazon Elastic Container Registry (ECR) +------------------------------------------------------------- + +Once the image is built we can use the ``push_gsprocessing_image.sh`` script +that will create an ECR repository if needed and push the image we just built. + +The script does not require any arguments and by default will +create a repository named ``graphstorm-processing`` in the ``us-west-2`` region, +on the default AWS account ``aws-cli`` is configured for, +and push the image tagged with the latest version of GSProcessing. + +The script supports 4 optional arguments: + +1. Image name/repository. (``-i/--image``) Default: ``graphstorm-processing`` +2. Image tag. Default: (``-v/--version``) ```` e.g. ``0.1.0``. +3. ECR region. Default: (``-r/--region``) ``us-west-2``. +4. AWS Account ID. (``-a/--account``) Default: Uses the account ID detected by the ``aws-cli``. + +Example: + +.. code-block:: bash + + bash push_gsprocessing_image.sh -i "graphstorm-processing" -v "0.1.0" -r "us-west-2" -a "1234567890" + + +Launch a SageMaker Processing job using the example scripts. +------------------------------------------------------------ + +Once the setup is complete, you can follow the +:doc:`SageMaker Processing job guide ` +to launch your distributed processing job using AWS resources. diff --git a/graphstorm-processing/docs/source/usage/example.rst b/graphstorm-processing/docs/source/usage/example.rst new file mode 100644 index 0000000000..ab25b5a1f1 --- /dev/null +++ b/graphstorm-processing/docs/source/usage/example.rst @@ -0,0 +1,268 @@ +GraphStorm Processing example +============================= + +To demonstrate how to use the library locally we will +use the same example data as we use in our +unit tests, which you can find in the project's repository, +under ``graphstorm/graphstorm-processing/tests/resources/small_heterogeneous_graph``. + +Install example dependencies +---------------------------- + +To run the local example you will need to install the GSProcessing +library to your Python environment, and you'll need to clone the +GraphStorm repository to get access to the data. + +Follow the :ref:`installation-ref` guide to install the GSProcessing library. + +You can clone the repository using + +.. 
code-block:: bash
+
+    git clone https://github.com/awslabs/graphstorm.git
+
+You can then navigate to the ``graphstorm-processing/`` directory
+that contains the relevant data:
+
+.. code-block:: bash
+
+    cd ./graphstorm/graphstorm-processing/
+
+
+Expected file inputs and configuration
+--------------------------------------
+
+GSProcessing expects the input files to be in a specific format that will allow
+us to perform the processing and prepare the data for partitioning and training.
+
+The data files are expected to be:
+
+* Tabular data files. We support CSV (with headers) or Parquet format.
+  The files can be split (multiple parts), or a single file.
+* Available on a local file system or on S3.
+* One tabular file source per edge and node type. For example, for a particular edge
+  type, all node identifiers (source, destination), features, and labels should
+  exist as columns in a single file source.
+
+Apart from the data, GSProcessing also requires a configuration file that describes the
+data and the transformations we will need to apply to the features and any encoding needed for
+labels.
+We support both the `GConstruct configuration format `_,
+and the library's own GSProcessing format, described in :doc:`/developer/input-configuration`.
+
+.. note::
+    We expect end users to only provide a GConstruct configuration file,
+    and only use the configuration format of GSProcessing as an intermediate
+    layer to decouple the two projects.
+
+    Developers who are looking to use GSProcessing
+    as their backend processing engine can either use the GSProcessing configuration
+    format directly, or translate their own configuration format to GSProcessing,
+    as we do with GConstruct.
+
+For a detailed description of all the entries of the GSProcessing configuration file see
+:doc:`/developer/input-configuration`.
+
+Relative file paths required
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The one difference from single-instance GConstruct files
+is that we require the file paths listed in the configuration file to be
+*relative to the location of the configuration file*. Specifically:
+
+* All file paths listed **must not** start with ``/``.
+* Assuming the configuration file is under ``$PATH``, and a filepath is listed as ``${FILEPATH}``
+  in the configuration file, the corresponding file is expected to exist at ``${PATH}/${FILEPATH}``.
+
+For example:
+
+.. code-block:: bash
+
+    > pwd
+    /home/path/to/data/ # This is the current working directory (cwd)
+    > ls
+    gconstruct-config.json edge_data # These are the files under the cwd
+    > ls edge_data/ # These are the files under the edge_data directory
+    movie-included_in-genre.csv
+
+The contents of the ``gconstruct-config.json`` can be:
+
+.. code-block:: python
+
+    {
+        "edges" : [
+            {
+                # Note that the file is a relative path
+                "files": ["edge_data/movie-included_in-genre.csv"],
+                "format": {
+                    "name": "csv",
+                    "separator" : ","
+                }
+                # [...] Other edge config values
+            }
+        ]
+    }
+
+Given the above we can run a job with local input data as:
+
+.. code-block:: bash
+
+    > gs-processing --input-data /home/path/to/data \
+        --config-filename gconstruct-config.json
+
+The benefit of using relative paths is that we can move the same files
+to any location, including S3, and run the same job without making changes to the config
+file:
+
+..
code-block:: bash + + # Move all files to new directory + > mv /home/path/to/data /home/new-path/to/data + # After moving all the files we can still use the same config + > gs-processing --input-data /home/new-path/to/data \ + --config-filename gconstruct-config.json + + # Upload data to S3 + > aws s3 sync /home/new-path/to/data s3://my-bucket/data/ + # We can still use the same config, just change the prefix to an S3 path + > python run_distributed_processing.py --input-data s3://my-bucket/data \ + --config-filename gconstruct-config.json + +Node files are optional +^^^^^^^^^^^^^^^^^^^^^^^ + +GSProcessing does not require node files to be provided for +every node type. If a node type appears in one of the edges, +its unique node identifiers will be determined by the edge files. + +In the example GConstruct file above (`gconstruct-config.json`), the node ids for the node types +``movie`` and ``genre`` will be extracted from the edge list provided. + +Example data and configuration +------------------------------ + +For this example we use a small heterogeneous graph inspired by the Movielens dataset. +You can see the configuration file under +``graphstorm-processing/tests/resources/small_heterogeneous_graph/gconstruct-config.json`` + +We have 4 node types, ``movie``, ``genre``, ``director``, and ``user``. The graph has 3 +edge types, ``movie:included_in:genre``, ``user:rated:movie``, and ``director:directed:movie``. + +We include one ``no-op`` feature, ``age``, that we directly pass to the output without any transformation, +and one label, ``gender``, that we transform to prepare the data for a node classification task. + + +Run a GSProcessing job locally +------------------------------ + +While GSProcessing is designed to run on distributed clusters, +we can also run small jobs in a local environment, using a local Spark instance. + +To do so, we will be using the ``gs-processing`` entry point, +to process the data and create the output on our local storage. + +We will provide an input and output prefix for our data, passing +local paths to the script. + +We also provide the argument ``--num-output-files`` that instructs PySpark +to try and create output with 4 partitions [#f1]_. + +Assuming our working directory is ``graphstorm/graphstorm-processing/`` +we can use the following command to run the processing job locally: + +.. code-block:: bash + + gs-processing --config-filename gconstruct-config.json \ + --input-prefix ./tests/resources/small_heterogeneous_graph \ + --output-prefix /tmp/gsprocessing-example/ \ + --num-output-files 4 + + +To finalize processing and to wrangle the data into the structure that +DGL distributed partitioning expects, we need an additional step that +guarantees the data conform to the expectations of DGL: + +.. code-block:: bash + + gs-repartition --input-prefix /tmp/gsprocessing-example/ + + +Examining the job output +------------------------ + +Once the processing and re-partitioning jobs are done, +we can examine the outputs they created. The output will be +compatible with the `Chunked Graph Format of DistDGL `_ +and can be used downstream to create a partitioned graph. + +.. code-block:: bash + + $ cd /tmp/gsprocessing-example + $ ls + + edges/ launch_arguments.json metadata.json node_data/ + node_id_mappings/ perf_counters.json updated_row_counts_metadata.json + +We have a few JSON files and the data directories containing +the graph structure, features, and labels. 
In more detail: + +* ``launch_arguments.json``: Contains the arguments that were used + to launch the processing job, allowing you to check the parameters after the + job finishes. +* ``updated_row_counts_metadata.json``: + This file is meant to be used as the input configuration for the + distributed partitioning pipeline. ``repartition_files.py`` produces + this file using the original ``metadata.json`` file as input. +* ``metadata.json``: Created by ``gs-processing`` and used as input + for ``repartition_files.py``, can be removed once that script has run. +* ``perf_counters.json``: A JSON file that contains runtime measurements + for the various components of GSProcessing. Can be used to profile the + application and discover bottlenecks. + +The directories created contain: + +* ``edges``: Contains the edge structures, one sub-directory per edge + type. Each edge file will contain two columns, the source and destination + `numerical` node id, named ``src_int_id`` and ``dist_int_id`` respectively. +* ``node_data``: Contains the features for the nodes, one sub-directory + per node type. Each file will contain one column named after the original + feature name that contains the value of the feature (could be a scalar or a vector). +* ``node_id_mappings``: Contains mappings from the original node ids to the + ones created by the processing job. This mapping would allow you to trace + back predictions to the original nodes/edges. The files will have two columns, + ``node_str_id`` that contains the original string ID of the node, and ``node_int_id`` + that contains the numerical id that the string id was mapped to. + +If the graph had included edge features they would appear +in an ``edge_data`` directory. + +.. note:: + + It's important to note that files for edges and edge data will have the + same order and row counts per file, as expected by DistDGL. Similarly, + all node feature files will have the same order and row counts, where + the first row corresponds to the feature value for node id 0, the second + for node id 1 etc. + + +At this point you can use the DGL distributed partitioning pipeline +to partition your data, as described in the +`DGL documentation `_ + +To simplify the process of partitioning and training, without the need +to manage your own infrastructure, we recommend using GraphStorm's +`SageMaker wrappers `_ +that do all the hard work for you and allow +you to focus on model development. + +To run GSProcessing jobs on Amazon SageMaker we'll need to follow +:doc:`/usage/distributed-processing-setup` to set up our environment +and :doc:`/usage/amazon-sagemaker` to execute the job. + + +.. rubric:: Footnotes + + +.. [#f1] Note that this is just a hint to the Spark engine, and it's + not guaranteed that the number of output partitions will always match + the requested value. \ No newline at end of file