From b49f757908a7231c814af55922df219f3a35136a Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Thu, 22 Jun 2023 10:44:07 +0200 Subject: [PATCH] Revert "Fix gradient checkpointing + fp16 autocast for most models (#24247)" This reverts commit 285a48011da3145ae77c5b22bcfbe77d367e5173. --- .../models/albert/modeling_albert.py | 6 +--- .../models/align/modeling_align.py | 9 ++--- .../models/altclip/modeling_altclip.py | 11 ++---- .../modeling_audio_spectrogram_transformer.py | 4 +-- .../models/autoformer/modeling_autoformer.py | 5 ++- src/transformers/models/bart/modeling_bart.py | 5 ++- src/transformers/models/beit/modeling_beit.py | 4 +-- src/transformers/models/bert/modeling_bert.py | 9 ++--- .../modeling_bert_generation.py | 9 ++--- .../models/big_bird/modeling_big_bird.py | 4 +-- .../modeling_bigbird_pegasus.py | 5 ++- .../models/biogpt/modeling_biogpt.py | 3 +- .../models/blenderbot/modeling_blenderbot.py | 5 ++- .../modeling_blenderbot_small.py | 5 ++- src/transformers/models/blip/modeling_blip.py | 3 +- .../models/blip/modeling_blip_text.py | 3 +- .../models/blip_2/modeling_blip_2.py | 11 ++---- .../models/bloom/modeling_bloom.py | 3 +- .../bridgetower/modeling_bridgetower.py | 11 ++---- .../models/camembert/modeling_camembert.py | 9 ++--- .../models/canine/modeling_canine.py | 9 ++--- .../chinese_clip/modeling_chinese_clip.py | 11 ++---- src/transformers/models/clap/modeling_clap.py | 12 ++----- src/transformers/models/clip/modeling_clip.py | 3 +- .../models/clipseg/modeling_clipseg.py | 3 +- .../models/codegen/modeling_codegen.py | 3 +- .../modeling_conditional_detr.py | 3 +- .../models/convbert/modeling_convbert.py | 9 ++--- src/transformers/models/cvt/modeling_cvt.py | 3 +- .../data2vec/modeling_data2vec_audio.py | 5 ++- .../models/data2vec/modeling_data2vec_text.py | 9 ++--- .../data2vec/modeling_data2vec_vision.py | 4 +-- .../models/deberta/modeling_deberta.py | 4 +-- .../models/deberta_v2/modeling_deberta_v2.py | 4 +-- .../modeling_decision_transformer.py | 4 +-- .../modeling_deformable_detr.py | 4 +-- src/transformers/models/deit/modeling_deit.py | 4 +-- src/transformers/models/deta/modeling_deta.py | 4 +-- src/transformers/models/detr/modeling_detr.py | 3 +- .../models/donut/modeling_donut_swin.py | 4 +-- src/transformers/models/dpt/modeling_dpt.py | 4 +-- .../models/electra/modeling_electra.py | 9 ++--- .../models/ernie/modeling_ernie.py | 9 ++--- src/transformers/models/esm/modeling_esm.py | 5 ++- .../models/flava/modeling_flava.py | 5 ++- src/transformers/models/fnet/modeling_fnet.py | 4 +-- .../models/focalnet/modeling_focalnet.py | 3 +- src/transformers/models/git/modeling_git.py | 11 ++---- src/transformers/models/gpt2/modeling_gpt2.py | 10 ++---- .../gpt_bigcode/modeling_gpt_bigcode.py | 3 +- .../models/gpt_neo/modeling_gpt_neo.py | 3 +- .../models/gpt_neox/modeling_gpt_neox.py | 3 +- src/transformers/models/gptj/modeling_gptj.py | 3 +- .../models/groupvit/modeling_groupvit.py | 3 +- .../models/hubert/modeling_hubert.py | 7 ++-- .../models/imagegpt/modeling_imagegpt.py | 4 +-- .../models/informer/modeling_informer.py | 7 ++-- .../models/layoutlm/modeling_layoutlm.py | 9 ++--- .../models/layoutlmv2/modeling_layoutlmv2.py | 4 +-- .../models/layoutlmv3/modeling_layoutlmv3.py | 4 +-- src/transformers/models/led/modeling_led.py | 5 ++- src/transformers/models/lilt/modeling_lilt.py | 9 ++--- .../models/llama/modeling_llama.py | 3 +- .../models/longformer/modeling_longformer.py | 9 ++--- .../models/longt5/modeling_longt5.py | 10 ++---- src/transformers/models/luke/modeling_luke.py | 4 +-- .../models/m2m_100/modeling_m2m_100.py | 5 ++- .../models/marian/modeling_marian.py | 5 ++- .../models/markuplm/modeling_markuplm.py | 3 +- .../mask2former/modeling_mask2former.py | 3 +- .../models/maskformer/modeling_maskformer.py | 3 +- .../maskformer/modeling_maskformer_swin.py | 4 +-- .../models/mbart/modeling_mbart.py | 5 ++- .../models/mctct/modeling_mctct.py | 3 +- .../megatron_bert/modeling_megatron_bert.py | 9 ++--- .../models/mobilevit/modeling_mobilevit.py | 4 +-- .../mobilevitv2/modeling_mobilevitv2.py | 3 +- src/transformers/models/mt5/modeling_mt5.py | 5 +-- src/transformers/models/mvp/modeling_mvp.py | 5 ++- .../models/nezha/modeling_nezha.py | 9 ++--- .../models/nllb_moe/modeling_nllb_moe.py | 6 ++-- .../nystromformer/modeling_nystromformer.py | 9 ++--- .../models/oneformer/modeling_oneformer.py | 3 +- .../models/open_llama/modeling_open_llama.py | 3 +- src/transformers/models/opt/modeling_opt.py | 3 +- .../models/owlvit/modeling_owlvit.py | 3 +- .../models/pegasus/modeling_pegasus.py | 5 ++- .../models/pegasus_x/modeling_pegasus_x.py | 5 ++- .../models/pix2struct/modeling_pix2struct.py | 7 ++-- .../models/plbart/modeling_plbart.py | 5 ++- .../models/prophetnet/modeling_prophetnet.py | 5 ++- .../models/qdqbert/modeling_qdqbert.py | 4 +-- .../models/realm/modeling_realm.py | 9 ++--- .../models/rembert/modeling_rembert.py | 9 ++--- .../models/retribert/modeling_retribert.py | 4 +-- .../models/roberta/modeling_roberta.py | 9 ++--- .../modeling_roberta_prelayernorm.py | 9 ++--- .../models/roc_bert/modeling_roc_bert.py | 9 ++--- .../models/roformer/modeling_roformer.py | 9 ++--- src/transformers/models/sam/modeling_sam.py | 3 +- src/transformers/models/sew/modeling_sew.py | 5 ++- .../models/sew_d/modeling_sew_d.py | 6 ++-- .../speech_to_text/modeling_speech_to_text.py | 5 ++- .../modeling_speech_to_text_2.py | 3 +- .../models/speecht5/modeling_speecht5.py | 7 ++-- .../models/splinter/modeling_splinter.py | 9 ++--- src/transformers/models/swin/modeling_swin.py | 4 +-- .../models/swin2sr/modeling_swin2sr.py | 4 +-- .../models/swinv2/modeling_swinv2.py | 4 +-- .../modeling_switch_transformers.py | 10 ++---- src/transformers/models/t5/modeling_t5.py | 10 ++---- .../modeling_table_transformer.py | 3 +- .../models/tapas/modeling_tapas.py | 3 +- .../modeling_time_series_transformer.py | 5 ++- .../timesformer/modeling_timesformer.py | 3 +- .../modeling_trajectory_transformer.py | 3 +- .../models/trocr/modeling_trocr.py | 3 +- src/transformers/models/tvlt/modeling_tvlt.py | 6 ++-- .../models/unispeech/modeling_unispeech.py | 7 ++-- .../unispeech_sat/modeling_unispeech_sat.py | 7 ++-- .../models/videomae/modeling_videomae.py | 6 ++-- src/transformers/models/vilt/modeling_vilt.py | 3 +- .../visual_bert/modeling_visual_bert.py | 9 ++--- src/transformers/models/vit/modeling_vit.py | 4 +-- .../models/vit_hybrid/modeling_vit_hybrid.py | 4 +-- .../models/vit_mae/modeling_vit_mae.py | 6 ++-- .../models/vit_msn/modeling_vit_msn.py | 4 +-- .../models/wav2vec2/modeling_wav2vec2.py | 7 ++-- .../modeling_wav2vec2_conformer.py | 5 ++- .../models/wavlm/modeling_wavlm.py | 7 ++-- .../models/whisper/modeling_whisper.py | 5 ++- .../models/x_clip/modeling_x_clip.py | 5 ++- src/transformers/models/xglm/modeling_xglm.py | 3 +- .../xlm_prophetnet/modeling_xlm_prophetnet.py | 5 ++- .../xlm_roberta/modeling_xlm_roberta.py | 9 ++--- .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 9 ++--- src/transformers/models/xmod/modeling_xmod.py | 9 ++--- .../models/yolos/modeling_yolos.py | 4 +-- src/transformers/models/yoso/modeling_yoso.py | 9 ++--- src/transformers/pytorch_utils.py | 15 -------- ...ng_{{cookiecutter.lowercase_modelname}}.py | 8 ++--- tests/models/align/test_modeling_align.py | 6 ---- tests/models/altclip/test_modeling_altclip.py | 6 ---- .../autoformer/test_modeling_autoformer.py | 6 ---- tests/models/beit/test_modeling_beit.py | 6 ---- .../models/big_bird/test_modeling_big_bird.py | 6 ---- tests/models/blip/test_modeling_blip.py | 6 ---- tests/models/canine/test_modeling_canine.py | 6 ---- .../test_modeling_chinese_clip.py | 12 ------- tests/models/clip/test_modeling_clip.py | 6 ---- tests/models/clipseg/test_modeling_clipseg.py | 12 ------- .../data2vec/test_modeling_data2vec_vision.py | 6 ---- tests/models/dpt/test_modeling_dpt.py | 6 ---- tests/models/dpt/test_modeling_dpt_hybrid.py | 6 ---- tests/models/flava/test_modeling_flava.py | 30 ---------------- tests/models/fnet/test_modeling_fnet.py | 6 ---- tests/models/gpt2/test_modeling_gpt2.py | 6 ---- .../graphormer/test_modeling_graphormer.py | 6 ---- .../models/imagegpt/test_modeling_imagegpt.py | 6 ---- .../models/informer/test_modeling_informer.py | 6 ---- .../models/layoutlm/test_modeling_layoutlm.py | 6 ---- tests/models/lilt/test_modeling_lilt.py | 6 ---- tests/models/luke/test_modeling_luke.py | 6 ---- tests/models/marian/test_modeling_marian.py | 6 ---- tests/models/owlvit/test_modeling_owlvit.py | 12 ------- tests/models/pegasus/test_modeling_pegasus.py | 6 ---- .../pix2struct/test_modeling_pix2struct.py | 6 ---- tests/models/regnet/test_modeling_regnet.py | 6 ---- .../models/roformer/test_modeling_roformer.py | 6 ---- tests/models/sam/test_modeling_sam.py | 6 ---- .../test_modeling_speech_to_text.py | 6 ---- .../test_modeling_switch_transformers.py | 6 ---- .../test_modeling_time_series_transformer.py | 6 ---- tests/models/van/test_modeling_van.py | 6 ---- tests/models/vilt/test_modeling_vilt.py | 6 ---- .../visual_bert/test_modeling_visual_bert.py | 6 ---- tests/models/vit_mae/test_modeling_vit_mae.py | 6 ---- tests/models/x_clip/test_modeling_x_clip.py | 6 ---- tests/test_modeling_common.py | 36 +------------------ 179 files changed, 271 insertions(+), 836 deletions(-) diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 6aaf187b040b45..9eadaa219834ee 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -34,11 +34,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 0a7238ef606976..09ee6eca62650e 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -30,12 +30,7 @@ BaseModelOutputWithPoolingAndNoAttention, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -1105,7 +1100,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 68a3a28a480128..26b3f59280810b 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -30,12 +30,7 @@ BaseModelOutputWithPoolingAndProjection, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig @@ -656,7 +651,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, @@ -970,7 +965,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 15b8c379352d8a..0f8c045121c749 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -25,7 +25,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_audio_spectrogram_transformer import ASTConfig @@ -343,7 +343,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py index 981df3ab845c3b..70587add17e721 100644 --- a/src/transformers/models/autoformer/modeling_autoformer.py +++ b/src/transformers/models/autoformer/modeling_autoformer.py @@ -34,7 +34,6 @@ Seq2SeqTSPredictionOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_autoformer import AutoformerConfig @@ -1211,7 +1210,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1429,7 +1428,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 9000ad3d060266..50452449021c32 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -35,7 +35,6 @@ Seq2SeqSequenceClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -850,7 +849,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1106,7 +1105,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index b546f14001911c..b17721fb2bcd32 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -34,7 +34,7 @@ SemanticSegmenterOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -517,7 +517,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 37f236d4a60291..fb92a0e84cc49e 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -40,12 +40,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -603,7 +598,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index f20503c594dff1..f92b7a0633e8cb 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -25,12 +25,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -413,7 +408,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 5e80d0423f7443..e1346a23c9db5b 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -37,7 +37,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, torch_custom_checkpointing +from ...pytorch_utils import apply_chunking_to_forward from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -1622,7 +1622,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 1ab72f0b49121c..8d7906631d54f2 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -36,7 +36,6 @@ Seq2SeqSequenceClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -1946,7 +1945,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -2292,7 +2291,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index c29c13547eb3eb..a9ecb11a61f1c2 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -32,7 +32,6 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -595,7 +594,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index f96531f51f7684..8f2780772cbd39 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -36,7 +36,6 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_end_docstrings, add_start_docstrings, @@ -780,7 +779,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1035,7 +1034,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index b09dce88e02e72..ef8d51a2b0e7ba 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -34,7 +34,6 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_end_docstrings, add_start_docstrings, @@ -778,7 +777,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1032,7 +1031,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index 93bb26c5b969f6..f16b89b7a316e7 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -25,7 +25,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -621,7 +620,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py index 38866578b6b021..1f269cf852ee0d 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -34,7 +34,6 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) -from ...pytorch_utils import torch_custom_checkpointing from ...utils import logging from .configuration_blip import BlipTextConfig @@ -428,7 +427,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index b326ff36c7ef3d..82a879771b786f 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -31,12 +31,7 @@ BaseModelOutputWithPoolingAndCrossAttentions, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -497,7 +492,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -968,7 +963,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index 2144c43687ae2b..4f6de49a144711 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -33,7 +33,6 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import logging from .configuration_bloom import BloomConfig @@ -776,7 +775,7 @@ def custom_forward(*inputs): return custom_forward - outputs = torch_custom_checkpointing( + outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(block), hidden_states, alibi, diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 37424e03545a93..4290241fbc097d 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -32,13 +32,8 @@ ModelOutput, SequenceClassifierOutput, ) -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_bridgetower import BridgeTowerConfig, BridgeTowerTextConfig, BridgeTowerVisionConfig @@ -815,7 +810,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index 25d11d24e14cfb..e98840fbc6d2a6 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -35,12 +35,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -534,7 +529,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index 8406a9d1d42fa9..a91d42f0395ee8 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -36,12 +36,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -805,7 +800,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 975857024e337f..0adf5cfdcb1857 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -31,12 +31,7 @@ BaseModelOutputWithPoolingAndCrossAttentions, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -919,7 +914,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, @@ -1028,7 +1023,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, ) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index fa836700066b8b..c4dbcb03f34df7 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -30,13 +30,7 @@ BaseModelOutputWithPoolingAndCrossAttentions, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - meshgrid, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -953,7 +947,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, input_dimensions, layer_head_mask ) else: @@ -1607,7 +1601,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 6a96715a276bcb..ee9d660ef71347 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -25,7 +25,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -645,7 +644,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index fc37277c34d5bf..85b11965306861 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -26,7 +26,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -655,7 +654,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 7cee097b2b1aaf..8b1d34f59e7bf6 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -24,7 +24,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_codegen import CodeGenConfig @@ -550,7 +549,7 @@ def custom_forward(*inputs): return custom_forward - outputs = torch_custom_checkpointing( + outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(block), hidden_states, None, diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index 44d9cc9bb5994a..023cb278484193 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -26,7 +26,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -1396,7 +1395,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, combined_attention_mask, diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index 49923ba1234ecd..bbdba210c23330 100755 --- a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -35,12 +35,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel, SequenceSummary -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_convbert import ConvBertConfig @@ -644,7 +639,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/cvt/modeling_cvt.py b/src/transformers/models/cvt/modeling_cvt.py index 8784fff414fb52..99e3a02febf4d2 100644 --- a/src/transformers/models/cvt/modeling_cvt.py +++ b/src/transformers/models/cvt/modeling_cvt.py @@ -26,8 +26,7 @@ from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import logging from .configuration_cvt import CvtConfig diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 72a53c292cee82..168f342acd3200 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -35,7 +35,6 @@ XVectorOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_data2vec_audio import Data2VecAudioConfig @@ -301,7 +300,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(conv_layer), hidden_states, ) @@ -601,7 +600,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 45c182a95c3d73..206fe1603b0045 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -34,12 +34,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -520,7 +515,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index cbef81d2a81bb8..77b424354892b9 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -33,7 +33,7 @@ SemanticSegmenterOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -529,7 +529,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 260e713d5b9e78..9a0d43db3a0aec 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -31,7 +31,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import softmax_backward_data, torch_custom_checkpointing +from ...pytorch_utils import softmax_backward_data from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_deberta import DebertaConfig @@ -464,7 +464,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), next_kv, attention_mask, diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 22aef359240422..1596ad4ffad42e 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -32,7 +32,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import softmax_backward_data, torch_custom_checkpointing +from ...pytorch_utils import softmax_backward_data from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_deberta_v2 import DebertaV2Config @@ -508,7 +508,7 @@ def custom_forward(*inputs): return custom_forward - output_states = torch_custom_checkpointing( + output_states = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), next_kv, attention_mask, diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index 64d64191c484c0..926947b1617de8 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -27,7 +27,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer, torch_custom_checkpointing +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -643,7 +643,7 @@ def custom_forward(*inputs): return custom_forward - outputs = torch_custom_checkpointing( + outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(block), hidden_states, None, diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index fc195622c2ddae..6469cf7a65df9e 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -41,7 +41,7 @@ ) from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import meshgrid, torch_custom_checkpointing +from ...pytorch_utils import meshgrid from ...utils import is_ninja_available, logging from ..auto import AutoBackbone from .configuration_deformable_detr import DeformableDetrConfig @@ -1380,7 +1380,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, encoder_hidden_states, diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index 4c5491935cad5a..8b03835812fcdf 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -33,7 +33,7 @@ MaskedImageModelingOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -364,7 +364,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index 67427b4f4137d8..af218829d6f9ab 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -36,7 +36,7 @@ ) from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import meshgrid, torch_custom_checkpointing +from ...pytorch_utils import meshgrid from ...utils import is_torchvision_available, logging, requires_backends from ..auto import AutoBackbone from .configuration_deta import DetaConfig @@ -1272,7 +1272,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, encoder_hidden_states, diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 684129663fa846..c92c43e46d18e9 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -26,7 +26,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -1131,7 +1130,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, combined_attention_mask, diff --git a/src/transformers/models/donut/modeling_donut_swin.py b/src/transformers/models/donut/modeling_donut_swin.py index 07f9fee14ed656..65c48eb81f8368 100644 --- a/src/transformers/models/donut/modeling_donut_swin.py +++ b/src/transformers/models/donut/modeling_donut_swin.py @@ -28,7 +28,7 @@ from ...activations import ACT2FN from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -756,7 +756,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, input_dimensions, layer_head_mask ) else: diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 0630a3c48be941..187a6c36656a8e 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -39,7 +39,7 @@ ) from ...modeling_outputs import BaseModelOutput, DepthEstimatorOutput, SemanticSegmenterOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ModelOutput, logging from ..auto import AutoBackbone from .configuration_dpt import DPTConfig @@ -535,7 +535,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 3197e060bd1a7f..a7ee4ec9320204 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -36,12 +36,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel, SequenceSummary -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -581,7 +576,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index a5f16a3a867f7f..b8df1b2d5035c3 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -38,12 +38,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -516,7 +511,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index 27b7bb2d917cb2..e0b26e0f7812b7 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -30,8 +30,7 @@ SequenceClassifierOutput, TokenClassifierOutput, ) -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import logging from .configuration_esm import EsmConfig @@ -611,7 +610,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 0f85ff06f58561..5d49197f8ca50e 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -26,8 +26,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling -from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -669,7 +668,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index 8d8de88c8f1d6b..6bc526eeebcb91 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -43,7 +43,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, torch_custom_checkpointing +from ...pytorch_utils import apply_chunking_to_forward from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -297,7 +297,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing(create_custom_forward(layer_module), hidden_states) + layer_outputs = torch.utils.checkpoint.checkpoint(create_custom_forward(layer_module), hidden_states) else: layer_outputs = layer_module(hidden_states) diff --git a/src/transformers/models/focalnet/modeling_focalnet.py b/src/transformers/models/focalnet/modeling_focalnet.py index 9e8efed44388db..fc327ad0b39f8c 100644 --- a/src/transformers/models/focalnet/modeling_focalnet.py +++ b/src/transformers/models/focalnet/modeling_focalnet.py @@ -28,7 +28,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BackboneOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -594,7 +593,7 @@ def custom_forward(*inputs): return custom_forward - stage_outputs = torch_custom_checkpointing( + stage_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(stage_module), hidden_states, input_dimensions, diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 83bf591fdb9885..23ae6d64962fe7 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -34,12 +34,7 @@ CausalLMOutputWithPast, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_git import GitConfig, GitVisionConfig @@ -462,7 +457,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, @@ -888,7 +883,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index dab2c613532bde..b9a8568f00e7fd 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -35,12 +35,8 @@ SequenceClassifierOutputWithPast, TokenClassifierOutput, ) -from ...modeling_utils import Conv1D, PreTrainedModel, SequenceSummary -from ...pytorch_utils import ( - find_pruneable_heads_and_indices, - prune_conv1d_layer, - torch_custom_checkpointing, -) +from ...modeling_utils import PreTrainedModel, SequenceSummary +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -894,7 +890,7 @@ def custom_forward(*inputs): return custom_forward - outputs = torch_custom_checkpointing( + outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(block), hidden_states, None, diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index cf23e1ba08a512..705d07b1da257f 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -28,7 +28,6 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -662,7 +661,7 @@ def custom_forward(*inputs): return custom_forward - outputs = torch_custom_checkpointing( + outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(block), hidden_states, None, diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 768893cb447148..b67f4ddbfacac3 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -34,7 +34,6 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_gpt_neo import GPTNeoConfig @@ -614,7 +613,7 @@ def custom_forward(*inputs): return custom_forward - outputs = torch_custom_checkpointing( + outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(block), hidden_states, None, diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 3f7bbcdf64e601..7c3bfd1035f904 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -36,7 +36,6 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import logging from .configuration_gpt_neox import GPTNeoXConfig @@ -558,7 +557,7 @@ def custom_forward(*inputs): return custom_forward - outputs = torch_custom_checkpointing( + outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 4969bd7fd1bb1c..de120167989d84 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -31,7 +31,6 @@ SequenceClassifierOutputWithPast, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -678,7 +677,7 @@ def custom_forward(*inputs): return custom_forward - outputs = torch_custom_checkpointing( + outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(block), hidden_states, None, diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index e5ee94adbd2699..c19ebd13b91d6f 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -28,7 +28,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -1038,7 +1037,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 774c4826c9bb94..70a8c079409b51 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -27,7 +27,6 @@ from ...deepspeed import is_deepspeed_zero3_enabled from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -354,7 +353,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(conv_layer), hidden_states, ) @@ -739,7 +738,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, @@ -829,7 +828,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index 31b911431f92ca..539119fabf281d 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -32,7 +32,7 @@ SequenceClassifierOutputWithPast, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer, torch_custom_checkpointing +from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_imagegpt import ImageGPTConfig @@ -826,7 +826,7 @@ def custom_forward(*inputs): return custom_forward - outputs = torch_custom_checkpointing( + outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(block), hidden_states, None, diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py index 4774f1d91d6655..2bf3f208a903fd 100644 --- a/src/transformers/models/informer/modeling_informer.py +++ b/src/transformers/models/informer/modeling_informer.py @@ -30,7 +30,6 @@ Seq2SeqTSPredictionOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_informer import InformerConfig @@ -1218,14 +1217,14 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, (head_mask[idx] if head_mask is not None else None), ) if conv_layer is not None: - output = torch_custom_checkpointing(conv_layer, layer_outputs[0]) + output = torch.utils.checkpoint.checkpoint(conv_layer, layer_outputs[0]) layer_outputs = (output,) + layer_outputs[1:] else: layer_outputs = encoder_layer( @@ -1441,7 +1440,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 614bebe121961c..410f76509422f3 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -33,12 +33,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_layoutlm import LayoutLMConfig @@ -497,7 +492,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index 0e0f2c1bd82361..5a6f39ce31a6e1 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -31,7 +31,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, torch_custom_checkpointing +from ...pytorch_utils import apply_chunking_to_forward from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -455,7 +455,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py index 31fb1f6fb5728f..db6618caaeaf30 100644 --- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py @@ -32,7 +32,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, torch_custom_checkpointing +from ...pytorch_utils import apply_chunking_to_forward from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_layoutlmv3 import LayoutLMv3Config @@ -671,7 +671,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 8fa8c00aadf736..a11659e3893389 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -35,7 +35,6 @@ Seq2SeqSequenceClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -1885,7 +1884,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -2151,7 +2150,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, combined_attention_mask, diff --git a/src/transformers/models/lilt/modeling_lilt.py b/src/transformers/models/lilt/modeling_lilt.py index 1953992d058fb3..74454d244e8d31 100644 --- a/src/transformers/models/lilt/modeling_lilt.py +++ b/src/transformers/models/lilt/modeling_lilt.py @@ -31,12 +31,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_lilt import LiltConfig @@ -524,7 +519,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layout_inputs, diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 2468f7088ba455..c9debdd252dc7a 100755 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -29,7 +29,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_llama import LlamaConfig @@ -569,7 +568,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 809d889eed47b4..665e2cb56421b6 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -25,12 +25,7 @@ from ...activations import ACT2FN, gelu from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -1316,7 +1311,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index d1358a78d8f536..1a49444e8a509c 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -23,6 +23,7 @@ import torch from torch import nn from torch.nn import CrossEntropyLoss +from torch.utils.checkpoint import checkpoint from ...activations import ACT2FN from ...modeling_outputs import ( @@ -32,12 +33,7 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - ALL_LAYERNORM_LAYERS, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, DUMMY_MASK, @@ -1521,7 +1517,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = checkpoint( create_custom_forward(layer_module), hidden_states, extended_attention_mask, diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index 0f217909b0ca31..ba21d3deb32e8d 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -26,7 +26,7 @@ from ...activations import ACT2FN, gelu from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import apply_chunking_to_forward, torch_custom_checkpointing +from ...pytorch_utils import apply_chunking_to_forward from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -795,7 +795,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), word_hidden_states, entity_hidden_states, diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index db8e017d17f31f..f8f9e1d3a8ee3d 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -32,7 +32,6 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -828,7 +827,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1075,7 +1074,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, combined_attention_mask, diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 15d58baeda6ac7..a75f833fb5cb87 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -35,7 +35,6 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_end_docstrings, add_start_docstrings, @@ -791,7 +790,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1040,7 +1039,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 0792ff1b723eec..0c6847b47815ce 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -43,7 +43,6 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) -from ...pytorch_utils import torch_custom_checkpointing from ...utils import logging from .configuration_markuplm import MarkupLMConfig @@ -654,7 +653,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py index 2fd61a179b1bc4..4cb2493e58c8bb 100644 --- a/src/transformers/models/mask2former/modeling_mask2former.py +++ b/src/transformers/models/mask2former/modeling_mask2former.py @@ -36,7 +36,6 @@ ) from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import logging from .configuration_mask2former import Mask2FormerConfig @@ -1876,7 +1875,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/maskformer/modeling_maskformer.py b/src/transformers/models/maskformer/modeling_maskformer.py index 2b91e975ce1ece..830f8b23c81602 100644 --- a/src/transformers/models/maskformer/modeling_maskformer.py +++ b/src/transformers/models/maskformer/modeling_maskformer.py @@ -28,7 +28,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithCrossAttentions from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -777,7 +776,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, combined_attention_mask, diff --git a/src/transformers/models/maskformer/modeling_maskformer_swin.py b/src/transformers/models/maskformer/modeling_maskformer_swin.py index e22f466edce82e..7016b598e8535b 100644 --- a/src/transformers/models/maskformer/modeling_maskformer_swin.py +++ b/src/transformers/models/maskformer/modeling_maskformer_swin.py @@ -28,7 +28,7 @@ from ...file_utils import ModelOutput from ...modeling_outputs import BackboneOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils.backbone_utils import BackboneMixin from .configuration_maskformer_swin import MaskFormerSwinConfig @@ -695,7 +695,7 @@ def custom_forward(*inputs): return custom_forward - layer_hidden_states, output_dimensions, layer_all_hidden_states = torch_custom_checkpointing( + layer_hidden_states, output_dimensions, layer_all_hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask ) else: diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 660177708a835e..67750ab42f7118 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -34,7 +34,6 @@ Seq2SeqSequenceClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -832,7 +831,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1090,7 +1089,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/mctct/modeling_mctct.py b/src/transformers/models/mctct/modeling_mctct.py index 22838d4e28d0e1..08e280b3ccf9b2 100755 --- a/src/transformers/models/mctct/modeling_mctct.py +++ b/src/transformers/models/mctct/modeling_mctct.py @@ -33,7 +33,6 @@ find_pruneable_heads_and_indices, prune_linear_layer, ) -from ...pytorch_utils import torch_custom_checkpointing from ...utils import logging from .configuration_mctct import MCTCTConfig @@ -624,7 +623,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 9a24f41e70b0dc..bba7e7369cb8a1 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -40,12 +40,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -561,7 +556,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/mobilevit/modeling_mobilevit.py b/src/transformers/models/mobilevit/modeling_mobilevit.py index e68357e6d37dc0..3503e86c9c75c2 100755 --- a/src/transformers/models/mobilevit/modeling_mobilevit.py +++ b/src/transformers/models/mobilevit/modeling_mobilevit.py @@ -33,7 +33,7 @@ SemanticSegmenterOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -633,7 +633,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, ) diff --git a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py index bd2f2bd9cf6ff2..b8c071a74f4b1e 100644 --- a/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py +++ b/src/transformers/models/mobilevitv2/modeling_mobilevitv2.py @@ -32,7 +32,6 @@ SemanticSegmenterOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -590,7 +589,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, ) diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index ce5c81f63425b5..a3cfce8ffc4a3f 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -23,6 +23,7 @@ import torch from torch import nn from torch.nn import CrossEntropyLoss +from torch.utils.checkpoint import checkpoint from ...activations import ACT2FN from ...modeling_outputs import ( @@ -32,7 +33,7 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, DUMMY_MASK, @@ -1045,7 +1046,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = checkpoint( create_custom_forward(layer_module), hidden_states, extended_attention_mask, diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index 4f905a7b51ed60..6a44768d8eec86 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -34,7 +34,6 @@ Seq2SeqSequenceClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -954,7 +953,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1232,7 +1231,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/nezha/modeling_nezha.py index 68a78a64faeb14..97c5b5a90ec3b5 100644 --- a/src/transformers/models/nezha/modeling_nezha.py +++ b/src/transformers/models/nezha/modeling_nezha.py @@ -38,12 +38,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -589,7 +584,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index d67032d119bcb0..06b61c7497dbe3 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -22,6 +22,7 @@ import torch import torch.nn as nn from torch.nn import CrossEntropyLoss +from torch.utils.checkpoint import checkpoint from ...activations import ACT2FN from ...deepspeed import is_deepspeed_zero3_enabled @@ -32,7 +33,6 @@ Seq2SeqMoEOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_end_docstrings, add_start_docstrings, @@ -1155,7 +1155,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1428,7 +1428,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = checkpoint( create_custom_forward(decoder_layer), hidden_states, combined_attention_mask, diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py index 6bbd95e7091da6..b859b0db1d4f4f 100755 --- a/src/transformers/models/nystromformer/modeling_nystromformer.py +++ b/src/transformers/models/nystromformer/modeling_nystromformer.py @@ -33,12 +33,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_nystromformer import NystromformerConfig @@ -380,7 +375,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/oneformer/modeling_oneformer.py b/src/transformers/models/oneformer/modeling_oneformer.py index 1e2c59a717d5fd..a874611acde892 100644 --- a/src/transformers/models/oneformer/modeling_oneformer.py +++ b/src/transformers/models/oneformer/modeling_oneformer.py @@ -28,7 +28,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -2620,7 +2619,7 @@ def __init__( def forward(self, hidden_states: torch.Tensor): for layer in self.layers: if self.use_checkpoint: - hidden_states = torch_custom_checkpointing(layer, hidden_states) + hidden_states = torch.utils.checkpoint.checkpoint(layer, hidden_states) else: hidden_states = layer(hidden_states) return hidden_states diff --git a/src/transformers/models/open_llama/modeling_open_llama.py b/src/transformers/models/open_llama/modeling_open_llama.py index 07b19a808de316..16ad554dc31344 100644 --- a/src/transformers/models/open_llama/modeling_open_llama.py +++ b/src/transformers/models/open_llama/modeling_open_llama.py @@ -29,7 +29,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_open_llama import OpenLlamaConfig @@ -604,7 +603,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index 79b555a5e3e0a2..bd64630c6200f5 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -29,7 +29,6 @@ SequenceClassifierOutputWithPast, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -701,7 +700,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, causal_attention_mask, diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 6f43ea603e3f35..f65a0688578e2d 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -27,7 +27,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -755,7 +754,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 3af479971e1893..a2bd3f3812e550 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -34,7 +34,6 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_end_docstrings, add_start_docstrings, @@ -806,7 +805,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1090,7 +1089,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index 94fc2d25ddcba8..8e380a4de5f0a0 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -33,7 +33,6 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_end_docstrings, add_start_docstrings, @@ -1073,7 +1072,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, global_hidden_states, @@ -1331,7 +1330,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 0834bbeaaf7bdd..2db104a5a112af 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -20,6 +20,7 @@ import torch import torch.utils.checkpoint from torch import nn +from torch.utils.checkpoint import checkpoint from ...activations import ACT2FN from ...modeling_outputs import ( @@ -30,7 +31,7 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ALL_LAYERNORM_LAYERS, torch_custom_checkpointing +from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( DUMMY_INPUTS, DUMMY_MASK, @@ -349,7 +350,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, @@ -1501,7 +1502,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = checkpoint( create_custom_forward(layer_module), hidden_states, extended_attention_mask, diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index 23a9f928d193fe..365429360af508 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -33,7 +33,6 @@ Seq2SeqSequenceClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_code_sample_docstrings, add_end_docstrings, @@ -811,7 +810,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1068,7 +1067,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 007d9aadf268ba..9160d5e1eb462d 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -28,7 +28,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -1337,7 +1336,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, extended_attention_mask, @@ -1578,7 +1577,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, extended_attention_mask, diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py index d4371c0efbb250..47a34e959072fa 100755 --- a/src/transformers/models/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/qdqbert/modeling_qdqbert.py @@ -39,7 +39,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -586,7 +586,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 09d6bb7325b4ec..f68fc04105de6a 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -31,12 +31,7 @@ ModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_realm import RealmConfig @@ -596,7 +591,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index 06da821d4dd37a..da4ad9608514c7 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -36,12 +36,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -553,7 +548,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/retribert/modeling_retribert.py b/src/transformers/models/retribert/modeling_retribert.py index e1397d39ceae2c..240d9476e70b01 100644 --- a/src/transformers/models/retribert/modeling_retribert.py +++ b/src/transformers/models/retribert/modeling_retribert.py @@ -21,10 +21,10 @@ from typing import Optional import torch +import torch.utils.checkpoint as checkpoint from torch import nn from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_start_docstrings, logging from ..bert.modeling_bert import BertModel from .configuration_retribert import RetriBertConfig @@ -141,7 +141,7 @@ def partial_encode(*inputs): for b in range(math.ceil(input_ids.shape[0] / checkpoint_batch_size)): b_embedding_output = embedding_output[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size] b_attention_mask = extended_attention_mask[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size] - pooled_output = torch_custom_checkpointing(partial_encode, b_embedding_output, b_attention_mask) + pooled_output = checkpoint.checkpoint(partial_encode, b_embedding_output, b_attention_mask) pooled_output_list.append(pooled_output) return torch.cat(pooled_output_list, dim=0) diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index f86fa4aa80820c..b0f13692460166 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -35,12 +35,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -520,7 +515,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index 01276cd07119a6..b1e02e27f13890 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -35,12 +35,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -522,7 +517,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 63abc9d4aa1876..7647c14a9ea3d4 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -35,12 +35,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -654,7 +649,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index 586ecbd2dad690..b966bf4490a9fd 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -36,12 +36,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel, SequenceSummary -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -590,7 +585,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py index 0e4177d90b0a1a..c3cbaa9176f0bf 100644 --- a/src/transformers/models/sam/modeling_sam.py +++ b/src/transformers/models/sam/modeling_sam.py @@ -28,7 +28,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_sam import SamConfig, SamMaskDecoderConfig, SamPromptEncoderConfig, SamVisionConfig @@ -1050,7 +1049,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, ) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 75ad1f97dffe8c..dd854c49f5c9d2 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -28,7 +28,6 @@ from ...deepspeed import is_deepspeed_zero3_enabled from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_sew import SEWConfig @@ -368,7 +367,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(conv_layer), hidden_states, ) @@ -681,7 +680,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index b7acb306bb91c5..7f7c1977d69248 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -29,7 +29,7 @@ from ...deepspeed import is_deepspeed_zero3_enabled from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import softmax_backward_data, torch_custom_checkpointing +from ...pytorch_utils import softmax_backward_data from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_sew_d import SEWDConfig @@ -460,7 +460,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(conv_layer), hidden_states, ) @@ -1141,7 +1141,7 @@ def custom_forward(*inputs): return custom_forward - output_states = torch_custom_checkpointing( + output_states = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), next_kv, attention_mask, diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 3e2024dc69ecf1..d8a19084eb3847 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -31,7 +31,6 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_speech_to_text import Speech2TextConfig @@ -821,7 +820,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1069,7 +1068,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index 12e8d4592adb65..c13b04642d9d54 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -27,7 +27,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_start_docstrings, logging, replace_return_docstrings from .configuration_speech_to_text_2 import Speech2Text2Config @@ -678,7 +677,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 5988607f1cb4f2..3e8ce5a23b7e6b 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -35,7 +35,6 @@ Seq2SeqSpectrogramOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_speecht5 import SpeechT5Config, SpeechT5HifiGanConfig @@ -529,7 +528,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(conv_layer), hidden_states, ) @@ -1395,7 +1394,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1724,7 +1723,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 88d6a480b70557..6e636fb695daef 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -27,12 +27,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, ModelOutput, QuestionAnsweringModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_splinter import SplinterConfig @@ -469,7 +464,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/swin/modeling_swin.py b/src/transformers/models/swin/modeling_swin.py index 93144c66a9134a..b324cfdcd9354c 100644 --- a/src/transformers/models/swin/modeling_swin.py +++ b/src/transformers/models/swin/modeling_swin.py @@ -29,7 +29,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BackboneOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -832,7 +832,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, input_dimensions, layer_head_mask ) else: diff --git a/src/transformers/models/swin2sr/modeling_swin2sr.py b/src/transformers/models/swin2sr/modeling_swin2sr.py index 6b1b803345557e..cd58b706505865 100644 --- a/src/transformers/models/swin2sr/modeling_swin2sr.py +++ b/src/transformers/models/swin2sr/modeling_swin2sr.py @@ -27,7 +27,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, ImageSuperResolutionOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -753,7 +753,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(stage_module), hidden_states, input_dimensions, layer_head_mask ) else: diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 07dd0a79b7ae5a..97b460479d6d5d 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -28,7 +28,7 @@ from ...activations import ACT2FN from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -908,7 +908,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, input_dimensions, layer_head_mask ) else: diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index 1378ec9a98d5ac..008e23531ac1a9 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -23,6 +23,7 @@ import torch import torch.nn as nn from torch.nn import CrossEntropyLoss +from torch.utils.checkpoint import checkpoint from ...activations import ACT2FN from ...modeling_outputs import ( @@ -32,12 +33,7 @@ Seq2SeqMoEOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - ALL_LAYERNORM_LAYERS, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, DUMMY_MASK, @@ -1079,7 +1075,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = checkpoint( create_custom_forward(layer_module), hidden_states, extended_attention_mask, diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 4531214e19cd16..050309fa9a3367 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -24,6 +24,7 @@ import torch from torch import nn from torch.nn import CrossEntropyLoss +from torch.utils.checkpoint import checkpoint from ...activations import ACT2FN from ...modeling_outputs import ( @@ -33,12 +34,7 @@ Seq2SeqModelOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - ALL_LAYERNORM_LAYERS, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( DUMMY_INPUTS, DUMMY_MASK, @@ -1078,7 +1074,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = checkpoint( create_custom_forward(layer_module), hidden_states, extended_attention_mask, diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py index 998f21a286109b..733ff7b9b453df 100644 --- a/src/transformers/models/table_transformer/modeling_table_transformer.py +++ b/src/transformers/models/table_transformer/modeling_table_transformer.py @@ -26,7 +26,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithCrossAttentions, Seq2SeqModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -1075,7 +1074,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, combined_attention_mask, diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 4f736a367e3022..1621653f3ee08b 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -34,7 +34,6 @@ find_pruneable_heads_and_indices, is_torch_greater_or_equal_than_1_12, prune_linear_layer, - torch_custom_checkpointing, ) from ...utils import ( ModelOutput, @@ -654,7 +653,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py index e3e0b3055d8b78..8986ef6729caaf 100644 --- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py +++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py @@ -31,7 +31,6 @@ Seq2SeqTSPredictionOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_time_series_transformer import TimeSeriesTransformerConfig @@ -950,7 +949,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -1167,7 +1166,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/timesformer/modeling_timesformer.py b/src/transformers/models/timesformer/modeling_timesformer.py index 5ff5bd7fd19d89..9f886b6ece5371 100644 --- a/src/transformers/models/timesformer/modeling_timesformer.py +++ b/src/transformers/models/timesformer/modeling_timesformer.py @@ -27,7 +27,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_timesformer import TimesformerConfig @@ -447,7 +446,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, ) diff --git a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py index 1027bd73f3fe96..e8ecedccb5ea50 100644 --- a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py +++ b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py @@ -26,7 +26,6 @@ from torch.nn import functional as F from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -557,7 +556,7 @@ def custom_forward(*inputs): return custom_forward - outputs = torch_custom_checkpointing( + outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(block), hidden_states, layer_past, diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index e8ee10f7defdb3..6276c68a425d10 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -27,7 +27,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_start_docstrings, logging, replace_return_docstrings from .configuration_trocr import TrOCRConfig @@ -710,7 +709,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/tvlt/modeling_tvlt.py b/src/transformers/models/tvlt/modeling_tvlt.py index 4b990cdb03ebee..3725c5e7728be9 100644 --- a/src/transformers/models/tvlt/modeling_tvlt.py +++ b/src/transformers/models/tvlt/modeling_tvlt.py @@ -29,7 +29,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -567,7 +567,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, @@ -884,7 +884,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, None, diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 5bd1af95c75abe..e068fa59e5792e 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -29,7 +29,6 @@ from ...deepspeed import is_deepspeed_zero3_enabled from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput, Wav2Vec2BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -392,7 +391,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(conv_layer), hidden_states, ) @@ -775,7 +774,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, @@ -865,7 +864,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index f603d2712f6d25..2ed8a5d57204e7 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -36,7 +36,6 @@ XVectorOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -406,7 +405,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(conv_layer), hidden_states, ) @@ -789,7 +788,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, @@ -879,7 +878,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 5f44a5e4b3aade..c62d0c4632cb68 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -30,7 +30,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -441,7 +441,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, @@ -724,7 +724,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, None, diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 5499a26cc748ab..6ee1e396a625e3 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -38,7 +38,6 @@ find_pruneable_heads_and_indices, meshgrid, prune_linear_layer, - torch_custom_checkpointing, ) from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_vilt import ViltConfig @@ -537,7 +536,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index a73d6ac720725b..0bef6e4af9d918 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -32,12 +32,7 @@ SequenceClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -428,7 +423,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index 28ea5740ca59f6..bfd440caae2b01 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -32,7 +32,7 @@ MaskedImageModelingOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -404,7 +404,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, diff --git a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py index ba3bbddf563b0d..051d431946a852 100644 --- a/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py +++ b/src/transformers/models/vit_hybrid/modeling_vit_hybrid.py @@ -27,7 +27,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from ..auto import AutoBackbone from .configuration_vit_hybrid import ViTHybridConfig @@ -422,7 +422,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py index 5a9c539fbc0b47..ef0c7c9f36869e 100755 --- a/src/transformers/models/vit_mae/modeling_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_vit_mae.py @@ -29,7 +29,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_start_docstrings, @@ -543,7 +543,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, @@ -800,7 +800,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, None, diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py b/src/transformers/models/vit_msn/modeling_vit_msn.py index 4f7b412fecb83b..46639e7d622cb7 100644 --- a/src/transformers/models/vit_msn/modeling_vit_msn.py +++ b/src/transformers/models/vit_msn/modeling_vit_msn.py @@ -27,7 +27,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_vit_msn import ViTMSNConfig @@ -394,7 +394,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 9705a51e488a09..43ab2408bb2309 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -37,7 +37,6 @@ XVectorOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -459,7 +458,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(conv_layer), hidden_states, ) @@ -811,7 +810,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, @@ -900,7 +899,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index 86c0cbe5e2d6cb..3e37a4a504b0b4 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -35,7 +35,6 @@ XVectorOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -524,7 +523,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(conv_layer), hidden_states, ) @@ -917,7 +916,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index 35dc46bac1f942..e4072d93724fd8 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -36,7 +36,6 @@ XVectorOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_wavlm import WavLMConfig @@ -362,7 +361,7 @@ def custom_forward(*inputs): return custom_forward - hidden_states = torch_custom_checkpointing( + hidden_states = torch.utils.checkpoint.checkpoint( create_custom_forward(conv_layer), hidden_states, ) @@ -721,7 +720,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, @@ -812,7 +811,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index ef6a98b6c5ba87..42fda344f6107f 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -34,7 +34,6 @@ SequenceClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -929,7 +928,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, None, @@ -1161,7 +1160,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index be6b2818900330..8db4ee0fd19480 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -26,7 +26,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -709,7 +708,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -956,7 +955,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index 61b51d51fcbcfb..4a72b785a02412 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -27,7 +27,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_xglm import XGLMConfig @@ -684,7 +683,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py index fd90086672cf40..2d14bfb6a7b548 100644 --- a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py @@ -29,7 +29,6 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import ( ModelOutput, add_start_docstrings, @@ -1357,7 +1356,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, extended_attention_mask, @@ -1601,7 +1600,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, extended_attention_mask, diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index e00574239c9296..ae8d51a3f8eb63 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -35,12 +35,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -521,7 +516,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index 71f8de5a7277b9..fb86717e1d7fa4 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -34,12 +34,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -509,7 +504,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index 44e50bed3b21ca..d99b77fedda38f 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -34,12 +34,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_xmod import XmodConfig @@ -583,7 +578,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, lang_ids, diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index 4b4aa012416780..e3cb02ceae6ec0 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -27,7 +27,7 @@ from ...activations import ACT2FN from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer, torch_custom_checkpointing +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -499,7 +499,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, layer_head_mask, diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index 1b1e6b13add8a3..8c2ff9fa4e0753 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -34,12 +34,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import ( - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, - torch_custom_checkpointing, -) +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_yoso import YosoConfig @@ -571,7 +566,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py index 520eeb89393aea..4723c43035e67c 100644 --- a/src/transformers/pytorch_utils.py +++ b/src/transformers/pytorch_utils.py @@ -285,18 +285,3 @@ def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: non-overlapping lifetimes may have the same id. """ return tensor.device, storage_ptr(tensor), storage_size(tensor) - - -def torch_custom_checkpointing(*args): - r""" - A correct usage of `torch.utils.checkpoint.checkpoint` as the default call leads to silent bugs that leads to the - gradients of the last layers not being updated. For more in depth detail of the issue, please have a look at: - https://github.com/huggingface/transformers/pull/24247 - """ - kwargs = {} - if "use_reentrant" in list(inspect.signature(torch.utils.checkpoint.checkpoint).parameters): - kwargs["use_reentrant"] = False - return torch.utils.checkpoint.checkpoint( - *args, - **kwargs, - ) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index c5d141b1f4b839..4899e195986fd2 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -43,7 +43,6 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel, SequenceSummary -from ...pytorch_utils import torch_custom_checkpointing from ...pytorch_utils import ( apply_chunking_to_forward, find_pruneable_heads_and_indices, @@ -551,7 +550,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer_module), hidden_states, attention_mask, @@ -1586,7 +1585,6 @@ def forward( CausalLMOutputWithCrossAttentions ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_custom_checkpointing from ...utils import logging from .configuration_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Config @@ -2320,7 +2318,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(encoder_layer), hidden_states, attention_mask, @@ -2559,7 +2557,7 @@ def custom_forward(*inputs): return custom_forward - layer_outputs = torch_custom_checkpointing( + layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(decoder_layer), hidden_states, attention_mask, diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index c8ac69840f77f7..2357c20e213a8d 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -352,12 +352,6 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @unittest.skip(reason="ALIGN does not use inputs_embeds") def test_inputs_embeds(self): pass diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 266e0c47b6bba6..28213de84df63c 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -186,12 +186,6 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @unittest.skip(reason="AltCLIPVisionModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/autoformer/test_modeling_autoformer.py b/tests/models/autoformer/test_modeling_autoformer.py index ad006d9d0794ce..9f0434689c4be8 100644 --- a/tests/models/autoformer/test_modeling_autoformer.py +++ b/tests/models/autoformer/test_modeling_autoformer.py @@ -238,12 +238,6 @@ def test_encoder_decoder_model_standalone(self): def test_resize_tokens_embeddings(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - # # Input is 'static_categorical_features' not 'input_ids' def test_model_main_input_name(self): model_signature = inspect.signature(getattr(AutoformerModel, "forward")) diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index 149820023a6940..f9aa7339f7e0c2 100644 --- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -227,12 +227,6 @@ def test_inputs_embeds(self): def test_multi_gpu_data_parallel_forward(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index 45bff430bfdb8e..f86c6d0ac70ab8 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -609,12 +609,6 @@ def test_for_change_to_full_attn(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_change_to_full_attn(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - # overwrite from common in order to skip the check on `attentions` def check_pt_flax_outputs(self, fx_outputs, pt_outputs, model_class, tol=1e-5, name="outputs", attributes=None): # `bigbird_block_sparse_attention` in `FlaxBigBird` returns `attention_probs = None`, while in PyTorch version, diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index a34efc026474d9..7d9c6b5ba58b05 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -789,12 +789,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_common_attributes(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py index 6e6d7ce3836a28..d612a02bf47c67 100644 --- a/tests/models/canine/test_modeling_canine.py +++ b/tests/models/canine/test_modeling_canine.py @@ -499,12 +499,6 @@ def test_inputs_embeds(self): # ViT does not use inputs_embeds pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @unittest.skip("CANINE does not have a get_input_embeddings() method.") def test_model_common_attributes(self): pass diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index cf2668f4d8b5ca..57f532da863515 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -395,12 +395,6 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @unittest.skip(reason="ChineseCLIPTextModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass @@ -475,12 +469,6 @@ def test_save_load_fast_init_from_base(self): def test_save_load_fast_init_to_base(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_model_from_pretrained(self): for model_name in CHINESE_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 82592d8452f5f8..d16241ab2f22a0 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -227,12 +227,6 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @unittest.skip(reason="CLIPVisionModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 387a2e1c8f3454..b54861d8d8d045 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -202,12 +202,6 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @unittest.skip(reason="CLIPSegVisionModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass @@ -454,12 +448,6 @@ def test_model_for_image_segmentation(self): def test_hidden_states_output(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @unittest.skip(reason="Inputs_embeds is tested in individual model tests") def test_inputs_embeds(self): pass diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py index 90786de24978f2..b4c391fea17e64 100644 --- a/tests/models/data2vec/test_modeling_data2vec_vision.py +++ b/tests/models/data2vec/test_modeling_data2vec_vision.py @@ -310,12 +310,6 @@ def test_for_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_image_classification(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_model_from_pretrained(self): for model_name in DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py index 5889653991cac2..76790ee795026e 100644 --- a/tests/models/dpt/test_modeling_dpt.py +++ b/tests/models/dpt/test_modeling_dpt.py @@ -182,12 +182,6 @@ def test_config(self): def test_inputs_embeds(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py index 04ba8c0289bed0..6d4a75c80da120 100644 --- a/tests/models/dpt/test_modeling_dpt_hybrid.py +++ b/tests/models/dpt/test_modeling_dpt_hybrid.py @@ -196,12 +196,6 @@ def test_config(self): def test_inputs_embeds(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index b6d71f33a684fd..2544b7ee93f6ca 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -185,12 +185,6 @@ def test_inputs_embeds(self): # FLAVA does not use inputs_embeds pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -468,12 +462,6 @@ def test_inputs_embeds(self): # FLAVA does not use inputs_embeds pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - # skip this test as FlavaTextModel has no base class and is # not available in MODEL_MAPPING def test_save_load_fast_init_from_base(self): @@ -636,12 +624,6 @@ def test_save_load_fast_init_from_base(self): def test_save_load_fast_init_to_base(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_model_from_pretrained(self): for model_name in FLAVA_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: @@ -749,12 +731,6 @@ def test_save_load_fast_init_from_base(self): def test_save_load_fast_init_to_base(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_model_from_pretrained(self): for model_name in FLAVA_CODEBOOK_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: @@ -1180,12 +1156,6 @@ class FlavaForPreTrainingTest(FlavaModelTest): class_for_tester = FlavaForPreTrainingTester test_torchscript = False - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index 96821842736522..e7e592d5b62ff5 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -444,12 +444,6 @@ def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_model_from_pretrained(self): for model_name in FNET_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index 620bb30b265713..65542b49549742 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -562,12 +562,6 @@ def test_gpt2_weight_initialization(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_gpt2_weight_initialization(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_batch_generation(self): model = GPT2LMHeadModel.from_pretrained("gpt2") diff --git a/tests/models/graphormer/test_modeling_graphormer.py b/tests/models/graphormer/test_modeling_graphormer.py index f1c63729e00063..e874ebf0f44a2b 100644 --- a/tests/models/graphormer/test_modeling_graphormer.py +++ b/tests/models/graphormer/test_modeling_graphormer.py @@ -356,12 +356,6 @@ def test_inputs_embeds(self): def test_feed_forward_chunking(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @unittest.skip(reason="Graphormer does not share input and output embeddings") def test_model_common_attributes(self): pass diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index 1f4ea02f8d2002..27d83f3eb8c1e9 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -304,12 +304,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_imagegpt_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_imagegpt_model(*config_and_inputs) diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py index 2202d62242cab9..f3c8539d845049 100644 --- a/tests/models/informer/test_modeling_informer.py +++ b/tests/models/informer/test_modeling_informer.py @@ -216,12 +216,6 @@ def test_encoder_decoder_model_standalone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index b88d0c4b50d87a..0535fbf4e1f4c8 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -279,12 +279,6 @@ def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def prepare_layoutlm_batch_inputs(): # Here we prepare a batch of 2 sequences to test a LayoutLM forward pass on: diff --git a/tests/models/lilt/test_modeling_lilt.py b/tests/models/lilt/test_modeling_lilt.py index 4032504b8b2587..1bb92300c3db91 100644 --- a/tests/models/lilt/test_modeling_lilt.py +++ b/tests/models/lilt/test_modeling_lilt.py @@ -275,12 +275,6 @@ def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_model_from_pretrained(self): for model_name in LILT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py index 4e1ef3d173b47c..35bdb6b6d5fa6a 100644 --- a/tests/models/luke/test_modeling_luke.py +++ b/tests/models/luke/test_modeling_luke.py @@ -697,12 +697,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_model_from_pretrained(self): for model_name in LUKE_PRETRAINED_MODEL_ARCHIVE_LIST: diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py index 933383d2929ad9..6cbcd55d3f7687 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -263,12 +263,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_save_load_strict(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs() for model_class in self.all_model_classes: diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index 83fb86ba0e9319..acf078ffe80075 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -155,12 +155,6 @@ def test_config(self): def test_inputs_embeds(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_model_common_attributes(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -639,12 +633,6 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def _create_and_check_torchscript(self, config, inputs_dict): if not self.test_torchscript: return diff --git a/tests/models/pegasus/test_modeling_pegasus.py b/tests/models/pegasus/test_modeling_pegasus.py index 1f409d1b004bfe..bde7477f945040 100644 --- a/tests/models/pegasus/test_modeling_pegasus.py +++ b/tests/models/pegasus/test_modeling_pegasus.py @@ -280,12 +280,6 @@ def test_encoder_decoder_model_standalone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_generate_fp16(self): config, input_dict = self.model_tester.prepare_config_and_inputs() input_ids = input_dict["input_ids"] diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index 1eba4cb10c287f..8ec023676d6327 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -332,12 +332,6 @@ def test_model(self): def test_training(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`") def test_training_gradient_checkpointing(self): pass diff --git a/tests/models/regnet/test_modeling_regnet.py b/tests/models/regnet/test_modeling_regnet.py index 9b260845287b10..e7c33699fda7db 100644 --- a/tests/models/regnet/test_modeling_regnet.py +++ b/tests/models/regnet/test_modeling_regnet.py @@ -161,12 +161,6 @@ def test_inputs_embeds(self): def test_model_common_attributes(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py index 6d54b7c1286e50..357e126a047a4d 100644 --- a/tests/models/roformer/test_modeling_roformer.py +++ b/tests/models/roformer/test_modeling_roformer.py @@ -452,12 +452,6 @@ def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_model_as_decoder_with_default_input_mask(self): # This regression test was failing with PyTorch < 1.3 ( diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py index 8a4772138647c8..a0f39a40135577 100644 --- a/tests/models/sam/test_modeling_sam.py +++ b/tests/models/sam/test_modeling_sam.py @@ -421,12 +421,6 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @unittest.skip(reason="SamModel has no base class and is not available in MODEL_MAPPING") def test_save_load_fast_init_from_base(self): pass diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index 1524ce24d26273..16ad704fd51043 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -324,12 +324,6 @@ def test_training(self): def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_generate_fp16(self): config, input_dict = self.model_tester.prepare_config_and_inputs() input_features = input_dict["input_features"] diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py index 4ff4554fbc3506..f8730d899329ff 100644 --- a/tests/models/switch_transformers/test_modeling_switch_transformers.py +++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py @@ -613,12 +613,6 @@ def test_decoder_model_past_with_attn_mask(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_attention_mask_past(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_beam_sample_generate_dict_output(self): r""" diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py index 44962267feea64..42319a1dd0a242 100644 --- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py +++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py @@ -200,12 +200,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_save_load_strict(self): config, _ = self.model_tester.prepare_config_and_inputs() for model_class in self.all_model_classes: diff --git a/tests/models/van/test_modeling_van.py b/tests/models/van/test_modeling_van.py index 7ec941dbc8851f..49df30a828a61e 100644 --- a/tests/models/van/test_modeling_van.py +++ b/tests/models/van/test_modeling_van.py @@ -243,12 +243,6 @@ def test_model_from_pretrained(self): model = VanModel.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py index 17447acf680d52..772091d5b976d5 100644 --- a/tests/models/vilt/test_modeling_vilt.py +++ b/tests/models/vilt/test_modeling_vilt.py @@ -340,12 +340,6 @@ def test_determinism(self): def test_model_outputs_equivalence(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py index 5dae4ebe1f9439..cf48fd7ffbec31 100644 --- a/tests/models/visual_bert/test_modeling_visual_bert.py +++ b/tests/models/visual_bert/test_modeling_visual_bert.py @@ -549,12 +549,6 @@ def test_model_for_flickr(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_flickr() self.model_tester.create_and_check_for_flickr(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_model_from_pretrained(self): for model_name in VISUAL_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py index 77c36bef8babe2..c58e2e94802e6b 100644 --- a/tests/models/vit_mae/test_modeling_vit_mae.py +++ b/tests/models/vit_mae/test_modeling_vit_mae.py @@ -208,12 +208,6 @@ def test_for_pretraining(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - # overwrite from common since ViTMAEForPretraining has random masking, we need to fix the noise # to generate masks during test def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 7fd65d871dbfc3..2efece44caebeb 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -202,12 +202,6 @@ def test_save_load_fast_init_from_base(self): def test_save_load_fast_init_to_base(self): pass - @unittest.skip( - reason="The model does not support GC + autocast + fp16: https://github.com/huggingface/transformers/pull/24247" - ) - def test_training_gradient_checkpointing_autocast(self): - pass - @slow def test_model_from_pretrained(self): for model_name in XCLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 7c02141f057e94..07a8b16bfef758 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import collections import copy import gc @@ -548,41 +549,6 @@ def test_training_gradient_checkpointing(self): loss = model(**inputs).loss loss.backward() - @slow - @require_torch_gpu - def test_training_gradient_checkpointing_autocast(self): - if not self.model_tester.is_training: - return - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.use_cache = False - config.return_dict = True - - if ( - model_class.__name__ - in [*get_values(MODEL_MAPPING_NAMES), *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)] - or not model_class.supports_gradient_checkpointing - ): - continue - model = model_class(config) - model.to(torch_device) - - optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) - - model.gradient_checkpointing_enable() - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - with torch.cuda.amp.autocast(True, dtype=torch.float16): - output = model(**inputs)[0] - loss = output.mean() - - loss.backward() - optimizer.step() - - for n, param in model.named_parameters(): - self.assertTrue(param.grad is not None, f"None gradient in param {n}") - def test_attention_outputs(self): if not self.has_attentions: self.skipTest(reason="Model does not output attentions")