[paddle] add documentation (#489)
* paddle documentation

Signed-off-by: Kirthi Shankar Sivamani <[email protected]>

* minor fix

Signed-off-by: Kirthi Shankar Sivamani <[email protected]>

* review comments

Signed-off-by: Kirthi Shankar Sivamani <[email protected]>

---------

Signed-off-by: Kirthi Shankar Sivamani <[email protected]>
ksivaman committed Oct 31, 2023
1 parent d58c08c commit 66d91d5
Showing 11 changed files with 288 additions and 31 deletions.
1 change: 1 addition & 0 deletions docs/api/framework.rst
@@ -10,3 +10,4 @@ Framework-specific API

pytorch
jax
paddle
34 changes: 34 additions & 0 deletions docs/api/paddle.rst
@@ -0,0 +1,34 @@
..
Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
See LICENSE for license information.

paddle
======

.. autoapiclass:: transformer_engine.paddle.Linear(in_features, out_features, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.LayerNorm(hidden_size, eps=1e-5, **kwargs)

.. autoapiclass:: transformer_engine.paddle.LayerNormLinear(in_features, out_features, eps=1e-5, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.LayerNormMLP(hidden_size, ffn_hidden_size, eps=1e-5, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.FusedScaleMaskSoftmax(attn_mask_type, mask_func, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.DotProductAttention(num_attention_heads, kv_channels, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.MultiHeadAttention(hidden_size, num_attention_heads, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.TransformerLayer(hidden_size, ffn_hidden_size, num_attention_heads, **kwargs)
:members: forward

.. autoapifunction:: transformer_engine.paddle.fp8_autocast

.. autoapifunction:: transformer_engine.paddle.recompute
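For orientation, a minimal usage sketch of the layers documented in this new file. The constructor arguments follow the signatures listed above; the [sequence, batch, hidden] input layout and the chosen sizes are illustrative assumptions, not a definitive recipe.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te

    # Build a single Transformer block with the documented constructor arguments.
    layer = te.TransformerLayer(hidden_size=1024,
                                ffn_hidden_size=4096,
                                num_attention_heads=16)

    # Assumed [sequence, batch, hidden] input; dimensions are kept divisible by 16
    # so the FP8 GEMM shape requirement noted under fp8_autocast is satisfied.
    inp = paddle.randn([128, 4, 1024])

    # Run the forward pass with FP8 enabled.
    with te.fp8_autocast(enabled=True):
        out = layer(inp)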
38 changes: 38 additions & 0 deletions transformer_engine/paddle/fp8.py
@@ -15,6 +15,10 @@
from .constants import dist_group_type
from .fp8_buffer import FP8MetaFwdBuffer, FP8MetaBwdBuffer, FP8RecomputeBuffer


__all__ = ['fp8_autocast']


# FP8 support
_is_fp8_available = None
_reason_for_no_fp8 = ""
@@ -166,6 +170,40 @@ def fp8_autocast(
) -> None:
"""
Context manager for FP8 usage.
.. code-block:: python

    with fp8_autocast(enabled=True):
        out = model(inp)

.. note::

    Support for FP8 in the Linear layer of Transformer Engine is currently limited to tensors
    with shapes where both dimensions are divisible by 16. In terms of the input to the full
    Transformer network, this typically requires padding the sequence length to be a multiple of 16.

.. note::

    When :attr:`fp8_recipe.reduce_amax==True`, any module must not be invoked more than once
    inside a single `fp8_autocast` region. This is unsupported behavior because the amax
    reduction is handled during the exit of the `fp8_autocast` context. Calling the same
    module more than once inside an `fp8_autocast` region overrides the amax tensors
    before reduction can occur.
Parameters
----------
enabled: bool, default = `False`
whether or not to enable fp8
calibrating: bool, default = `False`
calibration mode allows collecting statistics such as amax and scale
data of fp8 tensors even when executing without fp8 enabled. This is
useful for saving an inference ready fp8 checkpoint while training
using a higher precision.
fp8_recipe: recipe.DelayedScaling, default = `None`
recipe used for FP8 training.
fp8_group: paddle.distributed.collective.Group, default = `None`
distributed group over which amaxes for the fp8 tensors
are reduced at the end of each training step.
"""
try:
_global_fp8_state.enter(enabled, calibrating, fp8_recipe, fp8_group)
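As a supplement to the docstring above, a hedged sketch of the calibration workflow it describes. The module, shapes, and the use of the default `DelayedScaling` recipe are illustrative assumptions; only the parameters documented above are relied on.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te
    from transformer_engine.common import recipe

    model = te.Linear(in_features=768, out_features=768)   # placeholder module
    inp = paddle.randn([64, 768])                          # placeholder input

    # Collect amax/scale statistics while still running in higher precision.
    with te.fp8_autocast(enabled=False, calibrating=True):
        out = model(inp)

    # Later, run with FP8 enabled; a DelayedScaling recipe can be passed explicitly.
    fp8_recipe = recipe.DelayedScaling()
    with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
        out = model(inp)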
32 changes: 18 additions & 14 deletions transformer_engine/paddle/layer/attention.py
@@ -29,6 +29,9 @@
from ..recompute import recompute


__all__ = ["DotProductAttention", "MultiHeadAttention"]


class FusedAttnFuncPackedQKV(paddle.autograd.PyLayer):
"""Function for FusedAttention with packed QKV input"""

@@ -129,7 +132,7 @@ def backward(ctx, d_out):


class DotProductAttention(paddle.nn.Layer):
"""Dot Product Attention Layer
"""
Allows the model to jointly attend to information from different
representation subspaces as described in the paper:
`Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
@@ -150,8 +153,7 @@ class DotProductAttention(paddle.nn.Layer):
attention_type: {'self', 'cross'}, default = `self`
type of attention operation.
backend: {'transformer_engine', 'paddle'}, default = `transformer_engine`
backend to use for attention operation.
"""

def __init__(self,
@@ -215,17 +217,17 @@ def forward(
Parameters
----------
query_layer : paddle.Tensor
Query tensor.
key_value_layer : paddle.Tensor
Key tensor.
attention_mask : Optional[paddle.Tensor], default = `None`
Boolean tensor used to mask out softmax input when not using attention.
core_attention_bias_type: str, default = `no_bias`
only support no_bias type currently, {`no_bias`}
core_attention_bias: Optional[paddle.Tensor], default = `None`
Bias tensor for Q * K.T
set_zero: bool, default = `True`
Whether to use the fast path to set output tensors to 0 or not.
"""

backend = self.backend
@@ -358,7 +360,9 @@ def _pd_forward(


class MultiHeadAttention(paddle.nn.Layer):
"""Attention w/ QKV and Proj Gemms
"""
Multi-head Attention (MHA), including Query,
Key, Value and Output projection.
Parameters
----------
@@ -387,7 +391,8 @@ class MultiHeadAttention(paddle.nn.Layer):
zero_centered_gamma: bool, default = `False`
whether to zero initialize the gamma of the layernorm operation.
backend: {'transformer_engine', 'paddle'}, default = `transformer_engine`
backend to use for attention operation. If set to 'paddle', a framework-only,
non-FP8 path is executed with limited optimization.
Parallelism parameters
----------------------
@@ -542,7 +547,6 @@ def forward(
"""
MultiHeadAttention Layer.
Parameters
----------
hidden_states : paddle.Tensor
@@ -555,7 +559,7 @@
only support no_bias type currently, {`no_bias`}
core_attention_bias: Optional[paddle.Tensor], default = `None`
Bias tensor for Q * K.T
set_zero: bool, default = `True`
Whether to use the fast path to set output tensors to 0 or not.
recompute_core_attention: bool, default = `False`
If true, forward activations for core attention are recomputed
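To make the attention docstrings above concrete, a rough self-attention sketch. The [sequence, batch, hidden] layout, the call with only `hidden_states`, and the single-tensor return are assumptions based on the documented signatures, not verified behavior.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te

    # hidden_size must be divisible by num_attention_heads.
    mha = te.MultiHeadAttention(hidden_size=1024, num_attention_heads=16)

    # Assumed [sequence, batch, hidden] layout for the input.
    hidden_states = paddle.randn([128, 4, 1024])

    # Self-attention over the input; attention_mask defaults to None.
    out = mha(hidden_states)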
28 changes: 27 additions & 1 deletion transformer_engine/paddle/layer/layernorm.py
@@ -63,7 +63,33 @@ def backward(ctx, grad_output: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None
class LayerNorm(paddle.nn.Layer):
r"""
Applies Layer Normalization over a mini-batch of inputs as described in
the paper `Layer Normalization <https://arxiv.org/abs/1607.06450>`__
.. math::
y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * \gamma + \beta
:math:`\gamma` and :math:`\beta` are learnable affine transform parameters of
size :attr:`hidden_size`
Parameters
----------
hidden_size : int
size of each input sample.
eps : float, default = 1e-5
a value added to the denominator of layer normalization for numerical stability.
weight_attr: Union[paddle.ParamAttr, None], default = None
optional `paddle.ParamAttr` for weight.
bias_attr: Union[paddle.ParamAttr, None, bool], default = None
optional `paddle.ParamAttr` for bias.
zero_centered_gamma : bool, default = 'False'
if set to 'True', gamma parameter in LayerNorm is initialized to 0 and
the LayerNorm formula changes to
.. math::
y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} *
(1 + \gamma) + \beta
backend: {'transformer_engine', 'paddle'}, default = `transformer_engine`
backend to use for the layer normalization operation.
"""

def __init__(
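A short sketch of the zero-centered-gamma behavior described above; the constructor arguments follow the docstring, and the shapes are illustrative.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te

    ln = te.LayerNorm(hidden_size=1024, eps=1e-5, zero_centered_gamma=True)

    x = paddle.randn([32, 1024])
    y = ln(x)
    # With zero_centered_gamma=True, gamma is initialized to 0 and applied as
    # (1 + gamma), so the initial output matches a standard LayerNorm with gamma=1.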
47 changes: 45 additions & 2 deletions transformer_engine/paddle/layer/layernorm_linear.py
@@ -40,7 +40,7 @@
saved_tensor_allow_none,
)

__all__ = ["LayerNormLinear", "_layernorm_fwd_fp8_cast", "_layernorm_bwd"]
__all__ = ["LayerNormLinear"]


def _layernorm_fwd_fp8_cast(
@@ -331,6 +331,42 @@ def backward(
class LayerNormLinear(TransformerEngineBaseLayer):
r"""
Applies layer normalization followed by linear transformation to the incoming data.
Parameters
----------
in_features : int
size of each input sample.
out_features : int
size of each output sample.
eps : float, default = 1e-5
a value added to the denominator of layer normalization for numerical stability.
weight_attr: Union[paddle.ParamAttr, None], default = None
optional `paddle.ParamAttr` for weight.
bias_attr: Union[paddle.ParamAttr, None, bool], default = None
optional `paddle.ParamAttr` for bias.
return_layernorm_output : bool, default = `False`
if set to `True`, output of layernorm is returned from the forward
together with the output of the linear transformation.
Example use case: residual connection for transformer module is
taken post layernorm.
zero_centered_gamma : bool, default = 'False'
if set to 'True', gamma parameter in LayerNorm is initialized to 0 and
the LayerNorm formula changes to
.. math::
y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} *
(1 + \gamma) + \beta
backend: {'transformer_engine', 'paddle'}, default = 'transformer_engine'
if set to 'paddle', a framework-only, non-FP8 path is executed with limited optimization.
Parallelism parameters
----------------------
tp_group : ProcessGroup, default = `None`
tensor parallel process group.
parallel_mode : {None, 'Column', 'Row'}, default = `None`
used to decide whether this Linear layer is Column Parallel Linear or Row
Parallel Linear as described `here <https://arxiv.org/pdf/1909.08053.pdf>`_.
When set to `None`, no communication is performed.
"""

def __init__(
@@ -503,7 +539,14 @@ def _pd_forward(
return out

def forward(self, *args, **kwargs):
"""forward"""
"""
Apply layer normalization to the input followed by a linear transformation.
Parameters
----------
inp : paddle.Tensor
Input tensor.
"""
if self.backend == 'transformer_engine':
return self._te_forward(*args, **kwargs)
if self.backend == 'paddle':
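A minimal sketch of the `return_layernorm_output` option documented above. The docstring states that the layernorm output is returned together with the linear output; the `(out, ln_out)` ordering shown here is an assumption.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te

    ln_linear = te.LayerNormLinear(in_features=1024,
                                   out_features=3072,
                                   eps=1e-5,
                                   return_layernorm_output=True)

    x = paddle.randn([128, 4, 1024])

    # Second output is the post-layernorm activation, e.g. for a residual branch.
    out, ln_out = ln_linear(x)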
52 changes: 50 additions & 2 deletions transformer_engine/paddle/layer/layernorm_mlp.py
@@ -39,6 +39,7 @@
saved_tensor_allow_none,
)


__all__ = ["LayerNormMLP"]


@@ -549,7 +550,47 @@ def backward(

class LayerNormMLP(TransformerEngineBaseLayer):
r"""
Applies layer normalization on the input followed by the MLP module, consisting of
2 successive linear transformations, separated by the GeLU activation.
Parameters
----------
hidden_size : int
size of each input sample.
ffn_hidden_size : int
intermediate size to which input samples are projected.
eps : float, default = 1e-5
a value added to the denominator of layer normalization for numerical stability.
weight_attr: Union[paddle.ParamAttr, None], default = None
optional `paddle.ParamAttr` for weight.
bias_attr: Union[paddle.ParamAttr, None, bool], default = None
optional `paddle.ParamAttr` for bias.
activation : str, default = 'gelu'
activation function used.
Options: 'gelu', 'geglu', 'relu', 'reglu', 'squared_relu', 'swiglu'.
return_layernorm_output : bool, default = `False`
if set to `True`, output of layernorm is returned from the forward
together with the output of the linear transformation.
Example use case: residual connection for transformer module
is taken post layernorm.
zero_centered_gamma : bool, default = 'False'
if set to 'True', gamma parameter in LayerNorm is initialized to 0 and
the LayerNorm formula changes to
.. math::
y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} *
(1 + \gamma) + \beta
backend: {'transformer_engine', 'paddle'}, default = 'transformer_engine'
if set to 'paddle', a framework-only, non-FP8 path is executed with limited optimization.
Parallelism parameters
----------------------
set_parallel_mode : bool, default = `False`
if set to `True`, FC1 is used as Column Parallel and FC2 is used as Row
Parallel as described `here <https://arxiv.org/pdf/1909.08053.pdf>`_.
tp_group : paddle.distributed.collective.Group, default = `None`
tensor parallel process group.
"""

def __init__(
@@ -753,7 +794,14 @@ def _pd_forward(
return out

def forward(self, *args, **kwargs):
"""forward"""
"""
Apply layer normalization to the input followed by a feedforward network (MLP Block).
Parameters
----------
inp : paddle.Tensor
Input tensor.
"""
if self.backend == 'transformer_engine':
return self._te_forward(*args, **kwargs)
if self.backend == 'paddle':
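Finally, a hedged sketch of LayerNormMLP usage based on the constructor documented above; the activation choice, shapes, and the optional FP8 context are illustrative.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te

    mlp = te.LayerNormMLP(hidden_size=1024,
                          ffn_hidden_size=4096,
                          activation='gelu')

    x = paddle.randn([128, 4, 1024])

    # FP8 execution is optional; without the context the layer runs in the
    # input's original precision.
    with te.fp8_autocast(enabled=True):
        y = mlp(x)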