[paddle] add documentation (#489)
* paddle documentation

Signed-off-by: Kirthi Shankar Sivamani <[email protected]>

* minor fix

Signed-off-by: Kirthi Shankar Sivamani <[email protected]>

* review comments

Signed-off-by: Kirthi Shankar Sivamani <[email protected]>

---------

Signed-off-by: Kirthi Shankar Sivamani <[email protected]>
ksivaman committed Oct 31, 2023
1 parent d58c08c commit 66d91d5
Showing 11 changed files with 288 additions and 31 deletions.
1 change: 1 addition & 0 deletions docs/api/framework.rst
@@ -10,3 +10,4 @@ Framework-specific API

pytorch
jax
paddle
34 changes: 34 additions & 0 deletions docs/api/paddle.rst
@@ -0,0 +1,34 @@
..
Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
See LICENSE for license information.

paddle
======

.. autoapiclass:: transformer_engine.paddle.Linear(in_features, out_features, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.LayerNorm(hidden_size, eps=1e-5, **kwargs)

.. autoapiclass:: transformer_engine.paddle.LayerNormLinear(in_features, out_features, eps=1e-5, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.LayerNormMLP(hidden_size, ffn_hidden_size, eps=1e-5, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.FusedScaleMaskSoftmax(attn_mask_type, mask_func, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.DotProductAttention(num_attention_heads, kv_channels, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.MultiHeadAttention(hidden_size, num_attention_heads, **kwargs)
:members: forward

.. autoapiclass:: transformer_engine.paddle.TransformerLayer(hidden_size, ffn_hidden_size, num_attention_heads, **kwargs)
:members: forward

.. autoapifunction:: transformer_engine.paddle.fp8_autocast

.. autoapifunction:: transformer_engine.paddle.recompute
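For orientation, a minimal usage sketch of the layers documented in this new file. The constructor arguments follow the signatures listed above; the [sequence, batch, hidden] input layout and the chosen sizes are illustrative assumptions, not a definitive recipe.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te

    # Build a single Transformer block with the documented constructor arguments.
    layer = te.TransformerLayer(hidden_size=1024,
                                ffn_hidden_size=4096,
                                num_attention_heads=16)

    # Assumed [sequence, batch, hidden] input; dimensions are kept divisible by 16
    # so the FP8 GEMM shape requirement noted under fp8_autocast is satisfied.
    inp = paddle.randn([128, 4, 1024])

    # Run the forward pass with FP8 enabled.
    with te.fp8_autocast(enabled=True):
        out = layer(inp)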
38 changes: 38 additions & 0 deletions transformer_engine/paddle/fp8.py
@@ -15,6 +15,10 @@
from .constants import dist_group_type
from .fp8_buffer import FP8MetaFwdBuffer, FP8MetaBwdBuffer, FP8RecomputeBuffer


__all__ = ['fp8_autocast']


# FP8 support
_is_fp8_available = None
_reason_for_no_fp8 = ""
@@ -166,6 +170,40 @@ def fp8_autocast(
) -> None:
"""
Context manager for FP8 usage.
.. code-block:: python

    with fp8_autocast(enabled=True):
        out = model(inp)

.. note::

    Support for FP8 in the Linear layer of Transformer Engine is currently limited to tensors
    with shapes where both dimensions are divisible by 16. In terms of the input to the full
    Transformer network, this typically requires padding the sequence length to be a multiple of 16.

.. note::

    When :attr:`fp8_recipe.reduce_amax==True`, any module must not be invoked more than once
    inside a single `fp8_autocast` region. This is unsupported behavior because the amax
    reduction is handled during the exit of the `fp8_autocast` context. Calling the same
    module more than once inside an `fp8_autocast` region overrides the amax tensors
    before reduction can occur.
Parameters
----------
enabled: bool, default = `False`
whether or not to enable fp8
calibrating: bool, default = `False`
calibration mode allows collecting statistics such as amax and scale
data of fp8 tensors even when executing without fp8 enabled. This is
useful for saving an inference ready fp8 checkpoint while training
using a higher precision.
fp8_recipe: recipe.DelayedScaling, default = `None`
recipe used for FP8 training.
fp8_group: paddle.distributed.collective.Group, default = `None`
distributed group over which amaxes for the fp8 tensors
are reduced at the end of each training step.
"""
try:
_global_fp8_state.enter(enabled, calibrating, fp8_recipe, fp8_group)
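As a supplement to the docstring above, a hedged sketch of the calibration workflow it describes. The module, shapes, and the use of the default `DelayedScaling` recipe are illustrative assumptions; only the parameters documented above are relied on.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te
    from transformer_engine.common import recipe

    model = te.Linear(in_features=768, out_features=768)   # placeholder module
    inp = paddle.randn([64, 768])                          # placeholder input

    # Collect amax/scale statistics while still running in higher precision.
    with te.fp8_autocast(enabled=False, calibrating=True):
        out = model(inp)

    # Later, run with FP8 enabled; a DelayedScaling recipe can be passed explicitly.
    fp8_recipe = recipe.DelayedScaling()
    with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
        out = model(inp)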
32 changes: 18 additions & 14 deletions transformer_engine/paddle/layer/attention.py
@@ -29,6 +29,9 @@
from ..recompute import recompute


__all__ = ["DotProductAttention", "MultiHeadAttention"]


class FusedAttnFuncPackedQKV(paddle.autograd.PyLayer):
"""Function for FusedAttention with packed QKV input"""

@@ -129,7 +132,7 @@ def backward(ctx, d_out):


class DotProductAttention(paddle.nn.Layer):
"""Dot Product Attention Layer
"""
Allows the model to jointly attend to information from different
representation subspaces as described in the paper:
`Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
@@ -150,8 +153,7 @@ class DotProductAttention(paddle.nn.Layer):
attention_type: {'self', 'cross'}, default = `self`
type of attention operation.
backend: {'transformer_engine', 'paddle'}, default = `transformer_engine`
backend to use for attention operation.
"""

def __init__(self,
@@ -215,17 +217,17 @@ def forward(
Parameters
----------
query_layer : paddle.Tensor
Query tensor.
key_value_layer : paddle.Tensor
Key tensor.
attention_mask : Optional[paddle.Tensor], default = `None`
Boolean tensor used to mask out softmax input when not using attention.
core_attention_bias_type: str, default = `no_bias`
only support no_bias type currently, {`no_bias`}
core_attention_bias: Optional[paddle.Tensor], default = `None`
Bias tensor for Q * K.T
set_zero: bool, default = `True`
Whether to use the fast path to set output tensors to 0 or not.
"""

backend = self.backend
@@ -358,7 +360,9 @@ def _pd_forward(


class MultiHeadAttention(paddle.nn.Layer):
"""Attention w/ QKV and Proj Gemms
"""
Multi-head Attention (MHA), including Query,
Key, Value and Output projection.
Parameters
----------
@@ -387,7 +391,8 @@ class MultiHeadAttention(paddle.nn.Layer):
zero_centered_gamma: bool, default = `False`
whether to zero initialize the gamma of the layernorm operation.
backend: {'transformer_engine', 'paddle'}, default = `transformer_engine`
backend to use for attention operation. If set to 'paddle', a framework-only,
non-FP8 path is executed with limited optimization.
Parallelism parameters
----------------------
@@ -542,7 +547,6 @@ def forward(
"""
MultiHeadAttention Layer.
Parameters
----------
hidden_states : paddle.Tensor
@@ -555,7 +559,7 @@
only support no_bias type currently, {`no_bias`}
core_attention_bias: Optional[paddle.Tensor], default = `None`
Bias tensor for Q * K.T
set_zero: bool, default = `True`
Whether to use the fast path to set output tensors to 0 or not.
recompute_core_attention: bool, default = `False`
If true, forward activations for core attention are recomputed
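To make the attention docstrings above concrete, a rough self-attention sketch. The [sequence, batch, hidden] layout, the call with only `hidden_states`, and the single-tensor return are assumptions based on the documented signatures, not verified behavior.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te

    # hidden_size must be divisible by num_attention_heads.
    mha = te.MultiHeadAttention(hidden_size=1024, num_attention_heads=16)

    # Assumed [sequence, batch, hidden] layout for the input.
    hidden_states = paddle.randn([128, 4, 1024])

    # Self-attention over the input; attention_mask defaults to None.
    out = mha(hidden_states)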
28 changes: 27 additions & 1 deletion transformer_engine/paddle/layer/layernorm.py
@@ -63,7 +63,33 @@ def backward(ctx, grad_output: paddle.Tensor) -> Tuple[Union[paddle.Tensor, None
class LayerNorm(paddle.nn.Layer):
r"""
Applies Layer Normalization over a mini-batch of inputs as described in
the paper `Layer Normalization <https://arxiv.org/abs/1607.06450>`__
.. math::
y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} * \gamma + \beta
:math:`\gamma` and :math:`\beta` are learnable affine transform parameters of
size :attr:`hidden_size`
Parameters
----------
hidden_size : int
size of each input sample.
eps : float, default = 1e-5
a value added to the denominator of layer normalization for numerical stability.
weight_attr: Union[paddle.ParamAttr, None], default = None
optional `paddle.ParamAttr` for weight.
bias_attr: Union[paddle.ParamAttr, None, bool], default = None
optional `paddle.ParamAttr` for bias.
zero_centered_gamma : bool, default = 'False'
if set to 'True', gamma parameter in LayerNorm is initialized to 0 and
the LayerNorm formula changes to
.. math::
y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} *
(1 + \gamma) + \beta
backend: {'transformer_engine', 'paddle'}, default = `transformer_engine`
backend to use for the layer normalization operation.
"""

def __init__(
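A short sketch of the zero-centered-gamma behavior described above; the constructor arguments follow the docstring, and the shapes are illustrative.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te

    ln = te.LayerNorm(hidden_size=1024, eps=1e-5, zero_centered_gamma=True)

    x = paddle.randn([32, 1024])
    y = ln(x)
    # With zero_centered_gamma=True, gamma is initialized to 0 and applied as
    # (1 + gamma), so the initial output matches a standard LayerNorm with gamma=1.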
47 changes: 45 additions & 2 deletions transformer_engine/paddle/layer/layernorm_linear.py
@@ -40,7 +40,7 @@
saved_tensor_allow_none,
)

__all__ = ["LayerNormLinear", "_layernorm_fwd_fp8_cast", "_layernorm_bwd"]
__all__ = ["LayerNormLinear"]


def _layernorm_fwd_fp8_cast(
@@ -331,6 +331,42 @@ def backward(
class LayerNormLinear(TransformerEngineBaseLayer):
r"""
Applies layer normalization followed by linear transformation to the incoming data.
Parameters
----------
in_features : int
size of each input sample.
out_features : int
size of each output sample.
eps : float, default = 1e-5
a value added to the denominator of layer normalization for numerical stability.
weight_attr: Union[paddle.ParamAttr, None], default = None
optional `paddle.ParamAttr` for weight.
bias_attr: Union[paddle.ParamAttr, None, bool], default = None
optional `paddle.ParamAttr` for bias.
return_layernorm_output : bool, default = `False`
if set to `True`, output of layernorm is returned from the forward
together with the output of the linear transformation.
Example use case: residual connection for transformer module is
taken post layernorm.
zero_centered_gamma : bool, default = 'False'
if set to 'True', gamma parameter in LayerNorm is initialized to 0 and
the LayerNorm formula changes to
.. math::
y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} *
(1 + \gamma) + \beta
backend: {'transformer_engine', 'paddle'}, default = 'transformer_engine'
if set to 'paddle', a framework-only, non-FP8 path is executed with limited optimization.
Parallelism parameters
----------------------
tp_group : ProcessGroup, default = `None`
tensor parallel process group.
parallel_mode : {None, 'Column', 'Row'}, default = `None`
used to decide whether this Linear layer is Column Parallel Linear or Row
Parallel Linear as described `here <https://arxiv.org/pdf/1909.08053.pdf>`_.
When set to `None`, no communication is performed.
"""

def __init__(
@@ -503,7 +539,14 @@ def _pd_forward(
return out

def forward(self, *args, **kwargs):
"""forward"""
"""
Apply layer normalization to the input followed by a linear transformation.
Parameters
----------
inp : paddle.Tensor
Input tensor.
"""
if self.backend == 'transformer_engine':
return self._te_forward(*args, **kwargs)
if self.backend == 'paddle':
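A minimal sketch of the `return_layernorm_output` option documented above. The docstring states that the layernorm output is returned together with the linear output; the `(out, ln_out)` ordering shown here is an assumption.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te

    ln_linear = te.LayerNormLinear(in_features=1024,
                                   out_features=3072,
                                   eps=1e-5,
                                   return_layernorm_output=True)

    x = paddle.randn([128, 4, 1024])

    # Second output is the post-layernorm activation, e.g. for a residual branch.
    out, ln_out = ln_linear(x)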
52 changes: 50 additions & 2 deletions transformer_engine/paddle/layer/layernorm_mlp.py
@@ -39,6 +39,7 @@
saved_tensor_allow_none,
)


__all__ = ["LayerNormMLP"]


@@ -549,7 +550,47 @@ def backward(

class LayerNormMLP(TransformerEngineBaseLayer):
r"""
Applies layer normalization on the input followed by the MLP module, consisting of
2 successive linear transformations, separated by the GeLU activation.
Parameters
----------
hidden_size : int
size of each input sample.
ffn_hidden_size : int
intermediate size to which input samples are projected.
eps : float, default = 1e-5
a value added to the denominator of layer normalization for numerical stability.
weight_attr: Union[paddle.ParamAttr, None], default = None
optional `paddle.ParamAttr` for weight.
bias_attr: Union[paddle.ParamAttr, None, bool], default = None
optional `paddle.ParamAttr` for bias.
activation : str, default = 'gelu'
activation function used.
Options: 'gelu', 'geglu', 'relu', 'reglu', 'squared_relu', 'swiglu'.
return_layernorm_output : bool, default = `False`
if set to `True`, output of layernorm is returned from the forward
together with the output of the linear transformation.
Example use case: residual connection for transformer module
is taken post layernorm.
zero_centered_gamma : bool, default = 'False'
if set to 'True', gamma parameter in LayerNorm is initialized to 0 and
the LayerNorm formula changes to
.. math::
y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \varepsilon}} *
(1 + \gamma) + \beta
backend: {'transformer_engine', 'paddle'}, default = 'transformer_engine'
if set to 'paddle', a framework-only, non-FP8 path is executed with limited optimization.
Parallelism parameters
----------------------
set_parallel_mode : bool, default = `False`
if set to `True`, FC1 is used as Column Parallel and FC2 is used as Row
Parallel as described `here <https://arxiv.org/pdf/1909.08053.pdf>`_.
tp_group : paddle.distributed.collective.Group, default = `None`
tensor parallel process group.
"""

def __init__(
@@ -753,7 +794,14 @@ def _pd_forward(
return out

def forward(self, *args, **kwargs):
"""forward"""
"""
Apply layer normalization to the input followed by a feedforward network (MLP Block).
Parameters
----------
inp : paddle.Tensor
Input tensor.
"""
if self.backend == 'transformer_engine':
return self._te_forward(*args, **kwargs)
if self.backend == 'paddle':
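Finally, a hedged sketch of LayerNormMLP usage based on the constructor documented above; the activation choice, shapes, and the optional FP8 context are illustrative.

.. code-block:: python

    import paddle
    import transformer_engine.paddle as te

    mlp = te.LayerNormMLP(hidden_size=1024,
                          ffn_hidden_size=4096,
                          activation='gelu')

    x = paddle.randn([128, 4, 1024])

    # FP8 execution is optional; without the context the layer runs in the
    # input's original precision.
    with te.fp8_autocast(enabled=True):
        y = mlp(x)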