Byebye test_batching_equivalence's flakiness #35729

Draft: wants to merge 20 commits into base: main
3 changes: 0 additions & 3 deletions .circleci/config.yml
@@ -190,9 +190,6 @@ workflows:
- equal: [<<pipeline.project.git_url>>, https://github.com/huggingface/transformers]
- not: <<pipeline.parameters.nightly>>
jobs:
- check_circleci_user
- check_code_quality
- check_repository_consistency
- fetch_tests

setup_and_quality_2:
11 changes: 7 additions & 4 deletions .circleci/create_circleci_config.py
@@ -127,7 +127,7 @@ def to_dict(self):
timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else ""
marker_cmd = f"-m '{self.marker}'" if self.marker is not None else ""
additional_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> '
parallel = 1
steps = [
"checkout",
{"attach_workspace": {"at": "test_preparation"}},
@@ -152,9 +152,11 @@ def to_dict(self):
"command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt"
}
},
{"run": "pip install -U pytest"},
{"run": "pip install pytest-flakefinder"},
{"run": {
"name": "Run tests",
"command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {additional_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
"command": f"({timeout_cmd} python3 -m pytest @pytest.txt | tee tests_output.txt)"}
},
{"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
{"run": {"name": "Failed tests: show reasons", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
@@ -198,7 +200,7 @@ def job_name(self):
"torch",
docker_image=[{"image": "huggingface/transformers-torch-light"}],
marker="not generate",
parallelism=6,
parallelism=1,
)

generate_job = CircleCIJob(
@@ -358,7 +360,8 @@ def job_name(self):
PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job]
REPO_UTIL_TESTS = [repo_utils_job]
DOC_TESTS = [doc_test_job]
ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip
# ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip
ALL_TESTS = [torch_job]


def create_circleci_config(folder=None):
340 changes: 340 additions & 0 deletions pytest.txt

Large diffs are not rendered by default.
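Because pytest.txt is not rendered, here is a hedged sketch of what such an arguments file might contain, given the new "pip install -U pytest" and "pip install pytest-flakefinder" steps and the "pytest @pytest.txt" invocation above: recent pytest releases can read command-line arguments from a file passed with the @ prefix (one argument per line), and pytest-flakefinder's --flake-finder/--flake-runs options rerun every collected test several times so flaky failures surface deterministically. The lines below are an illustration, not the actual 340-line file; the worker count and test ids are assumptions:

--numprocesses=8
-rsfE
--flake-finder
--flake-runs=5
tests/models/wav2vec2/test_modeling_wav2vec2.py::Wav2Vec2RobustModelTest::test_batching_equivalence
tests/models/groupvit/test_modeling_groupvit.py::GroupViTModelTest::test_batching_equivalence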

21 changes: 21 additions & 0 deletions pytest_failed.txt
@@ -0,0 +1,21 @@
tests/models/wav2vec2/test_modeling_wav2vec2.py::Wav2Vec2RobustModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in Wav2Vec2ForPreTraining for key=projected_quantized_states.
tests/models/unispeech/test_modeling_unispeech.py::UniSpeechRobustModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in UniSpeechForPreTraining for key=projected_quantized_states.
tests/models/unispeech/test_modeling_unispeech.py::UniSpeechRobustModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in UniSpeechForPreTraining for key=projected_quantized_states.

tests/models/groupvit/test_modeling_groupvit.py::GroupViTModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in GroupViTModel for key=vision_model_output.
tests/models/groupvit/test_modeling_groupvit.py::GroupViTModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in GroupViTModel for key=logits_per_image.
tests/models/groupvit/test_modeling_groupvit.py::GroupViTModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in GroupViTModel for key=logits_per_image.
tests/models/groupvit/test_modeling_groupvit.py::GroupViTModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in GroupViTModel for key=vision_model_output.


tests/models/superpoint/test_modeling_superpoint.py::SuperPointModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in SuperPointForKeypointDetection for key=keypoints.
tests/models/oneformer/test_modeling_oneformer.py::OneFormerModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in OneFormerForUniversalSegmentation for key=transformer_decoder_object_queries.





tests/models/flaubert/test_modeling_flaubert.py::FlaubertModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in FlaubertForQuestionAnswering for key=end_top_index.
tests/models/xlm/test_modeling_xlm.py::XLMModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in XLMForQuestionAnswering for key=end_top_index.
tests/models/xlnet/test_modeling_xlnet.py::XLNetModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in XLNetForQuestionAnswering for key=end_top_index.
tests/models/xlnet/test_modeling_xlnet.py::XLNetModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in XLNetForQuestionAnswering for key=end_top_index.
25 changes: 25 additions & 0 deletions src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -1194,6 +1194,7 @@ def __init__(self, config):

# can be decayed for training
self.temperature = 2
self.outputs = []  # debug: collect intermediate tensors for the batching-equivalence investigation

@staticmethod
def _compute_perplexity(probs, mask=None):
@@ -1210,9 +1211,16 @@ def _compute_perplexity(probs, mask=None):
def forward(self, hidden_states, mask_time_indices=None):
batch_size, sequence_length, hidden_size = hidden_states.shape

self.outputs.append(hidden_states)
self.outputs.append(mask_time_indices)

# project to codevector dim
hidden_states = self.weight_proj(hidden_states)
self.outputs.append(hidden_states)


hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
self.outputs.append(hidden_states)

if self.training:
# sample code vector probs via gumbel in differentiable way
@@ -1229,18 +1237,26 @@
# take argmax in non-differentiable way
# compute hard codevector distribution (one hot)
codevector_idx = hidden_states.argmax(dim=-1)
self.outputs.append(codevector_idx)
codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_(
-1, codevector_idx.view(-1, 1), 1.0
)
self.outputs.append(codevector_probs)
codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
self.outputs.append(codevector_probs)

perplexity = self._compute_perplexity(codevector_probs, mask_time_indices)
self.outputs.append(perplexity)

codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
self.outputs.append(codevector_probs)
# use probs to retrieve codevectors
codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
self.outputs.append(codevectors_per_group)
codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
self.outputs.append(codevectors)
codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
self.outputs.append(codevectors)

return codevectors, perplexity

@@ -1858,6 +1874,8 @@ def __init__(self, config: Wav2Vec2Config):
# Initialize weights and apply final processing
self.post_init()

self.outputs = []  # debug: collect intermediate tensors for the batching-equivalence investigation

def set_gumbel_temperature(self, temperature: int):
"""
Set the Gumbel softmax temperature to a given value. Only necessary for training
@@ -1989,22 +2007,29 @@ def forward(

# 1. project all transformed features (including masked) to final vq dim
transformer_features = self.project_hid(outputs[0])
self.outputs.append(transformer_features)

# 2. quantize all (unmasked) extracted features and project to final vq dim
extract_features = self.dropout_features(outputs[1])
self.outputs.append(extract_features)

if attention_mask is not None:
# compute reduced attention_mask corresponding to feature vectors
attention_mask = self._get_feature_vector_attention_mask(
extract_features.shape[1], attention_mask, add_adapter=False
)
self.outputs.append(attention_mask)

quantized_features, codevector_perplexity = self.quantizer(
extract_features, mask_time_indices=mask_time_indices
)
self.outputs.append(mask_time_indices)
self.outputs.append(quantized_features)

quantized_features = quantized_features.to(self.project_q.weight.dtype)
self.outputs.append(quantized_features)
quantized_features = self.project_q(quantized_features)
self.outputs.append(quantized_features)

loss = contrastive_loss = diversity_loss = None
if sampled_negative_indices is not None:
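The self.outputs.append(...) calls added above are debugging instrumentation for this draft. A hedged sketch of how the recorded tensors might be compared across two runs (for example, the same single row fed once on its own and once inside a batch); the helper below is hypothetical and not part of the PR:

import torch

def report_divergence(outputs_run_a, outputs_run_b, atol=1e-5):
    # Walk the tensors recorded at matching steps in two forward passes and flag
    # the first step whose values differ beyond the tolerance (only steps whose
    # shapes line up are compared).
    for step, (a, b) in enumerate(zip(outputs_run_a, outputs_run_b)):
        if isinstance(a, torch.Tensor) and isinstance(b, torch.Tensor) and a.shape == b.shape:
            diff = (a.float() - b.float()).abs().max().item()
            marker = "  <-- first divergence" if diff > atol else ""
            print(f"step {step:2d}: max abs diff = {diff:.3e}{marker}")
            if diff > atol:
                return step
    return None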
2 changes: 1 addition & 1 deletion src/transformers/testing_utils.py
@@ -1476,7 +1476,7 @@ def set_config_for_less_flaky_test(config):

def set_model_for_less_flaky_test(model):
# Another way to make sure norm layers have the desired epsilon. (Some models don't set it from their config.)
target_names = ("LayerNorm", "GroupNorm", "BatchNorm", "RMSNorm", "BatchNorm2d", "BatchNorm1d")
target_names = ("LayerNorm", "GroupNorm", "BatchNorm", "RMSNorm", "BatchNorm2d", "BatchNorm1d", "BitGroupNormActivation", "WeightStandardizedConv2d")
target_attrs = ["eps", "epsilon", "variance_epsilon"]
if is_torch_available() and isinstance(model, torch.nn.Module):
for module in model.modules():
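The hunk above only shows the setup of set_model_for_less_flaky_test; a hedged sketch of how the loop over model.modules() presumably continues (the epsilon value used here is an assumption, the real one lives in the part of the function that is not shown):

import torch

def raise_norm_eps(model: torch.nn.Module, value: float = 1.0) -> None:
    # Sketch: bump the epsilon of every norm-like layer so that tiny numerical
    # differences between batched and single-row runs are damped.
    target_names = ("LayerNorm", "GroupNorm", "BatchNorm", "RMSNorm", "BatchNorm2d",
                    "BatchNorm1d", "BitGroupNormActivation", "WeightStandardizedConv2d")
    target_attrs = ("eps", "epsilon", "variance_epsilon")
    for module in model.modules():
        if any(name in type(module).__name__ for name in target_names):
            for attr in target_attrs:
                if hasattr(module, attr):
                    setattr(module, attr, value)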
4 changes: 4 additions & 0 deletions tests/models/autoformer/test_modeling_autoformer.py
@@ -217,6 +217,10 @@ def setUp(self):
self.model_tester = AutoformerModelTester(self)
self.config_tester = ConfigTester(self, config_class=AutoformerConfig, has_text_modality=False)

@is_flaky(description="The computation of `tmp_delay` in `AutoformerAttention.forward` seems wrong, see PR #12345. Also `topk` is used to compute indices which is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

def test_config(self):
self.config_tester.run_common_tests()

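For context on why index computations such as topk/argmax make test_batching_equivalence flaky, here is a small illustration (not taken from the PR): batched and single-row matmuls can differ by a few float32 ULPs, and on nearly tied scores that is enough to flip the selected index.

import torch

torch.manual_seed(0)
proj = torch.nn.Linear(16, 8)
x = torch.randn(4, 10, 16)

batched = proj(x)       # the whole batch at once
single = proj(x[:1])    # the first row computed on its own

# The same row typically differs by a few ULPs between the two runs ...
print((batched[:1] - single).abs().max())
# ... which can flip argmax/topk whenever two scores are nearly tied
# (whether it actually flips here depends on hardware and kernels).
print(torch.equal(batched[:1].argmax(-1), single.argmax(-1)))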
5 changes: 5 additions & 0 deletions tests/models/dac/test_modeling_dac.py
@@ -146,6 +146,11 @@ def test_model_forward(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model_forward(*config_and_inputs)

# TODO (ydshieh): Although we have a potential cause, it's still strange that this test fails all the time with large differences
@unittest.skip(reason="Might be caused by `indices` computed with `max()` in `decode_latents`")
def test_batching_equivalence(self):
super().test_batching_equivalence()

def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()

1 change: 0 additions & 1 deletion tests/models/dpt/test_modeling_dpt_hybrid.py
@@ -304,7 +304,6 @@ def test_raise_readout_type(self):
with self.assertRaises(ValueError):
_ = DPTForDepthEstimation(config)

@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
def test_batching_equivalence(self):
super().test_batching_equivalence()

6 changes: 5 additions & 1 deletion tests/models/esm/test_modeling_esmfold.py
@@ -17,7 +17,7 @@
import unittest

from transformers import EsmConfig, is_torch_available
from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device
from transformers.testing_utils import TestCasePlus, is_flaky, require_torch, slow, torch_device

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
@@ -184,6 +184,10 @@ def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)

@is_flaky(description="Unknown flaky reason ...")
def test_batching_equivalence(self):
super().test_batching_equivalence()

@unittest.skip(reason="Does not support attention outputs")
def test_attention_outputs(self):
pass
6 changes: 5 additions & 1 deletion tests/models/flaubert/test_modeling_flaubert.py
@@ -17,7 +17,7 @@
import unittest

from transformers import FlaubertConfig, is_sacremoses_available, is_torch_available
from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device
from transformers.testing_utils import is_flaky, require_torch, require_torch_accelerator, slow, torch_device

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
@@ -439,6 +439,10 @@ def test_flaubert_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_flaubert_model(*config_and_inputs)

@is_flaky(description="The indices computed with `topk()` in `SQuADHead` (of `FlaubertForQuestionAnswering`) is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

# Copied from tests/models/distilbert/test_modeling_distilbert.py with Distilbert->Flaubert
def test_flaubert_model_with_sinusoidal_encodings(self):
config = FlaubertConfig(sinusoidal_embeddings=True)
10 changes: 9 additions & 1 deletion tests/models/groupvit/test_modeling_groupvit.py
@@ -24,7 +24,7 @@
import requests

from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
from transformers.testing_utils import is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device
from transformers.testing_utils import is_flaky, is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available

from ...test_configuration_common import ConfigTester
@@ -162,6 +162,10 @@ def test_config(self):
def test_inputs_embeds(self):
pass

@is_flaky(description="The `index` computed with `max()` in `hard_softmax` is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

@is_pt_tf_cross_test
def test_pt_tf_model_equivalence(self):
import tensorflow as tf
@@ -571,6 +575,10 @@ def test_model(self):
def test_config(self):
self.config_tester.run_common_tests()

@is_flaky(description="The `index` computed with `max()` in `hard_softmax` is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

@unittest.skip(reason="hidden_states are tested in individual model tests")
def test_hidden_states_output(self):
pass
1 change: 0 additions & 1 deletion tests/models/mimi/test_modeling_mimi.py
@@ -734,7 +734,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self):
def test_sdpa_can_compile_dynamic(self):
pass

@is_flaky()
def test_batching_equivalence(self):
super().test_batching_equivalence()

1 change: 0 additions & 1 deletion tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py
@@ -214,7 +214,6 @@ def test_model_from_pretrained(self):
model = MobileNetV1Model.from_pretrained(model_name)
self.assertIsNotNone(model)

@is_flaky(description="is_flaky https://github.com/huggingface/transformers/pull/31258")
def test_batching_equivalence(self):
super().test_batching_equivalence()

1 change: 0 additions & 1 deletion tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py
@@ -269,7 +269,6 @@ def test_model_from_pretrained(self):
model = MobileNetV2Model.from_pretrained(model_name)
self.assertIsNotNone(model)

@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
def test_batching_equivalence(self):
super().test_batching_equivalence()

1 change: 0 additions & 1 deletion tests/models/mobilevit/test_modeling_mobilevit.py
@@ -274,7 +274,6 @@ def test_model_from_pretrained(self):
model = MobileViTModel.from_pretrained(model_name)
self.assertIsNotNone(model)

@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
def test_batching_equivalence(self):
super().test_batching_equivalence()

6 changes: 5 additions & 1 deletion tests/models/superpoint/test_modeling_superpoint.py
@@ -16,7 +16,7 @@
from typing import List

from transformers.models.superpoint.configuration_superpoint import SuperPointConfig
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available

from ...test_configuration_common import ConfigTester
@@ -135,6 +135,10 @@ def setUp(self):
def test_config(self):
self.config_tester.run_common_tests()

@is_flaky(description="The `indices` computed with `topk()` in `top_k_keypoints` is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

@unittest.skip(reason="SuperPointForKeypointDetection does not use inputs_embeds")
def test_inputs_embeds(self):
pass
6 changes: 5 additions & 1 deletion tests/models/timm_backbone/test_modeling_timm_backbone.py
@@ -18,7 +18,7 @@
import unittest

from transformers import AutoBackbone
from transformers.testing_utils import require_timm, require_torch, torch_device
from transformers.testing_utils import is_flaky, require_timm, require_torch, torch_device
from transformers.utils.import_utils import is_torch_available

from ...test_backbone_common import BackboneTesterMixin
@@ -115,6 +115,10 @@ def setUp(self):
def test_config(self):
self.config_tester.run_common_tests()

@is_flaky(description="`TimmBackbone` has no `_init_weights`. Timm's way of weight init. seems to give larger magnitude in the intermediate values during `forward`.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

def test_timm_transformer_backbone_equivalence(self):
timm_checkpoint = "resnet18"
transformers_checkpoint = "microsoft/resnet-18"
6 changes: 5 additions & 1 deletion tests/models/unispeech/test_modeling_unispeech.py
@@ -22,7 +22,7 @@
from datasets import load_dataset

from transformers import UniSpeechConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import is_flaky, require_soundfile, require_torch, slow, torch_device

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@@ -329,6 +329,10 @@ def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)

@is_flaky(description="The `codevector_idx` computed with `argmax()` in `UniSpeechGumbelVectorQuantizer.forward` is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

def test_batched_inference(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_batch_inference(*config_and_inputs)
5 changes: 5 additions & 0 deletions tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -30,6 +30,7 @@
from transformers.testing_utils import (
CaptureLogger,
cleanup,
is_flaky,
is_pt_flax_cross_test,
is_pyctcdecode_available,
is_torchaudio_available,
@@ -863,6 +864,10 @@ def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)

@is_flaky(description="The `codevector_idx` computed with `argmax()` in `Wav2Vec2GumbelVectorQuantizer.forward` is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

def test_model_with_adapter(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model_with_adapter(*config_and_inputs)