huggingface · younesbelkada · Jun 21, 2023 · Jun 13, 2023 · Jun 14, 2023 · Jun 14, 2023
diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py
@@ -29,6 +29,7 @@
     SequenceClassifierOutputWithPast,
 )
 from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import get_checkpointing_kwargs
 from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
@@ -700,12 +701,15 @@ def custom_forward(*inputs):
 
                     return custom_forward
 
+                gradient_checkpointing_kwargs = get_checkpointing_kwargs()
+
                 layer_outputs = torch.utils.checkpoint.checkpoint(
                     create_custom_forward(decoder_layer),
                     hidden_states,
                     causal_attention_mask,
                     head_mask[idx] if head_mask is not None else None,
                     None,
+                    **gradient_checkpointing_kwargs,
                 )
             else:
                 layer_outputs = decoder_layer(

diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py
@@ -289,3 +289,15 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]:
     non-overlapping lifetimes may have the same id.
     """
     return tensor.device, storage_ptr(tensor), storage_size(tensor)
+
+
+def get_checkpointing_kwargs() -> dict:
+    r"""
+    Build the extra keyword arguments to forward to `torch.utils.checkpoint.checkpoint`, as the default call leads to
+    silent bugs that lead to the gradients of the last layers not being updated. `use_reentrant=False` is only
+    included when the installed `checkpoint` signature has that parameter; otherwise an empty dict is returned. For
+    more in depth detail of the issue, please have a look at: https://github.com/huggingface/transformers/pull/24247
+    """
+    # Only pass `use_reentrant` when the installed `checkpoint` accepts it, so it is never handed an unknown kwarg.
+    checkpoint_parameters = inspect.signature(torch.utils.checkpoint.checkpoint).parameters
+    return {"use_reentrant": False} if "use_reentrant" in checkpoint_parameters else {}