Byebye test_batching_equivalence's flakiness #35729

Draft: wants to merge 20 commits into base: main
3 changes: 0 additions & 3 deletions .circleci/config.yml
@@ -190,9 +190,6 @@ workflows:
- equal: [<<pipeline.project.git_url>>, https://github.com/huggingface/transformers]
- not: <<pipeline.parameters.nightly>>
jobs:
- check_circleci_user
- check_code_quality
- check_repository_consistency
- fetch_tests

setup_and_quality_2:
11 changes: 7 additions & 4 deletions .circleci/create_circleci_config.py
@@ -127,7 +127,7 @@ def to_dict(self):
timeout_cmd = f"timeout {self.command_timeout} " if self.command_timeout else ""
marker_cmd = f"-m '{self.marker}'" if self.marker is not None else ""
additional_flags = f" -p no:warning -o junit_family=xunit1 --junitxml=test-results/junit.xml"
parallel = f' << pipeline.parameters.{self.job_name}_parallelism >> '
parallel = 1
steps = [
"checkout",
{"attach_workspace": {"at": "test_preparation"}},
@@ -152,9 +152,11 @@ def to_dict(self):
"command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt"
}
},
{"run": "pip install -U pytest"},
{"run": "pip install pytest-flakefinder"},
{"run": {
"name": "Run tests",
"command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {additional_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
"command": f"({timeout_cmd} python3 -m pytest @pytest.txt | tee tests_output.txt)"}
},
{"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
{"run": {"name": "Failed tests: show reasons", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
@@ -198,7 +200,7 @@ def job_name(self):
"torch",
docker_image=[{"image": "huggingface/transformers-torch-light"}],
marker="not generate",
parallelism=6,
parallelism=1,
)

generate_job = CircleCIJob(
@@ -358,7 +360,8 @@ def job_name(self):
PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job]
REPO_UTIL_TESTS = [repo_utils_job]
DOC_TESTS = [doc_test_job]
ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip
# ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip
ALL_TESTS = [torch_job]


def create_circleci_config(folder=None):
340 changes: 340 additions & 0 deletions pytest.txt

Large diffs are not rendered by default.
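Because pytest.txt is not rendered, here is a hedged sketch of what such an arguments file might contain, given the new "pip install -U pytest" and "pip install pytest-flakefinder" steps and the "pytest @pytest.txt" invocation above: recent pytest releases can read command-line arguments from a file passed with the @ prefix (one argument per line), and pytest-flakefinder's --flake-finder/--flake-runs options rerun every collected test several times so flaky failures surface deterministically. The lines below are an illustration, not the actual 340-line file; the worker count and test ids are assumptions:

--numprocesses=8
-rsfE
--flake-finder
--flake-runs=5
tests/models/wav2vec2/test_modeling_wav2vec2.py::Wav2Vec2RobustModelTest::test_batching_equivalence
tests/models/groupvit/test_modeling_groupvit.py::GroupViTModelTest::test_batching_equivalence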

21 changes: 21 additions & 0 deletions pytest_failed.txt
@@ -0,0 +1,21 @@
tests/models/wav2vec2/test_modeling_wav2vec2.py::Wav2Vec2RobustModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in Wav2Vec2ForPreTraining for key=projected_quantized_states.
tests/models/unispeech/test_modeling_unispeech.py::UniSpeechRobustModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in UniSpeechForPreTraining for key=projected_quantized_states.
tests/models/unispeech/test_modeling_unispeech.py::UniSpeechRobustModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in UniSpeechForPreTraining for key=projected_quantized_states.

tests/models/groupvit/test_modeling_groupvit.py::GroupViTModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in GroupViTModel for key=vision_model_output.
tests/models/groupvit/test_modeling_groupvit.py::GroupViTModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in GroupViTModel for key=logits_per_image.
tests/models/groupvit/test_modeling_groupvit.py::GroupViTModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in GroupViTModel for key=logits_per_image.
tests/models/groupvit/test_modeling_groupvit.py::GroupViTModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in GroupViTModel for key=vision_model_output.


tests/models/superpoint/test_modeling_superpoint.py::SuperPointModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in SuperPointForKeypointDetection for key=keypoints.
tests/models/oneformer/test_modeling_oneformer.py::OneFormerModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in OneFormerForUniversalSegmentation for key=transformer_decoder_object_queries.





tests/models/flaubert/test_modeling_flaubert.py::FlaubertModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in FlaubertForQuestionAnswering for key=end_top_index.
tests/models/xlm/test_modeling_xlm.py::XLMModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in XLMForQuestionAnswering for key=end_top_index.
tests/models/xlnet/test_modeling_xlnet.py::XLNetModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in XLNetForQuestionAnswering for key=end_top_index.
tests/models/xlnet/test_modeling_xlnet.py::XLNetModelTest::test_batching_equivalence - AssertionError: Batched and Single row outputs are not equal in XLNetForQuestionAnswering for key=end_top_index.
25 changes: 25 additions & 0 deletions src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -1194,6 +1194,7 @@ def __init__(self, config):

# can be decayed for training
self.temperature = 2
self.outputs = []  # debug: collect intermediate tensors for the batching-equivalence investigation

@staticmethod
def _compute_perplexity(probs, mask=None):
@@ -1210,9 +1211,16 @@ def _compute_perplexity(probs, mask=None):
def forward(self, hidden_states, mask_time_indices=None):
batch_size, sequence_length, hidden_size = hidden_states.shape

self.outputs.append(hidden_states)
self.outputs.append(mask_time_indices)

# project to codevector dim
hidden_states = self.weight_proj(hidden_states)
self.outputs.append(hidden_states)


hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
self.outputs.append(hidden_states)

if self.training:
# sample code vector probs via gumbel in differentiable way
@@ -1229,18 +1237,26 @@
# take argmax in non-differentiable way
# compute hard codevector distribution (one hot)
codevector_idx = hidden_states.argmax(dim=-1)
self.outputs.append(codevector_idx)
codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_(
-1, codevector_idx.view(-1, 1), 1.0
)
self.outputs.append(codevector_probs)
codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
self.outputs.append(codevector_probs)

perplexity = self._compute_perplexity(codevector_probs, mask_time_indices)
self.outputs.append(perplexity)

codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
self.outputs.append(codevector_probs)
# use probs to retrieve codevectors
codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
self.outputs.append(codevectors_per_group)
codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
self.outputs.append(codevectors)
codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
self.outputs.append(codevectors)

return codevectors, perplexity

@@ -1858,6 +1874,8 @@ def __init__(self, config: Wav2Vec2Config):
# Initialize weights and apply final processing
self.post_init()

self.outputs = []  # debug: collect intermediate tensors for the batching-equivalence investigation

def set_gumbel_temperature(self, temperature: int):
"""
Set the Gumbel softmax temperature to a given value. Only necessary for training
@@ -1989,22 +2007,29 @@ def forward(

# 1. project all transformed features (including masked) to final vq dim
transformer_features = self.project_hid(outputs[0])
self.outputs.append(transformer_features)

# 2. quantize all (unmasked) extracted features and project to final vq dim
extract_features = self.dropout_features(outputs[1])
self.outputs.append(extract_features)

if attention_mask is not None:
# compute reduced attention_mask corresponding to feature vectors
attention_mask = self._get_feature_vector_attention_mask(
extract_features.shape[1], attention_mask, add_adapter=False
)
self.outputs.append(attention_mask)

quantized_features, codevector_perplexity = self.quantizer(
extract_features, mask_time_indices=mask_time_indices
)
self.outputs.append(mask_time_indices)
self.outputs.append(quantized_features)

quantized_features = quantized_features.to(self.project_q.weight.dtype)
self.outputs.append(quantized_features)
quantized_features = self.project_q(quantized_features)
self.outputs.append(quantized_features)

loss = contrastive_loss = diversity_loss = None
if sampled_negative_indices is not None:
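The self.outputs.append(...) calls added above are debugging instrumentation for this draft. A hedged sketch of how the recorded tensors might be compared across two runs (for example, the same single row fed once on its own and once inside a batch); the helper below is hypothetical and not part of the PR:

import torch

def report_divergence(outputs_run_a, outputs_run_b, atol=1e-5):
    # Walk the tensors recorded at matching steps in two forward passes and flag
    # the first step whose values differ beyond the tolerance (only steps whose
    # shapes line up are compared).
    for step, (a, b) in enumerate(zip(outputs_run_a, outputs_run_b)):
        if isinstance(a, torch.Tensor) and isinstance(b, torch.Tensor) and a.shape == b.shape:
            diff = (a.float() - b.float()).abs().max().item()
            marker = "  <-- first divergence" if diff > atol else ""
            print(f"step {step:2d}: max abs diff = {diff:.3e}{marker}")
            if diff > atol:
                return step
    return None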
2 changes: 1 addition & 1 deletion src/transformers/testing_utils.py
@@ -1476,7 +1476,7 @@ def set_config_for_less_flaky_test(config):

def set_model_for_less_flaky_test(model):
# Another way to make sure norm layers have the desired epsilon. (Some models don't set it from their config.)
target_names = ("LayerNorm", "GroupNorm", "BatchNorm", "RMSNorm", "BatchNorm2d", "BatchNorm1d")
target_names = ("LayerNorm", "GroupNorm", "BatchNorm", "RMSNorm", "BatchNorm2d", "BatchNorm1d", "BitGroupNormActivation", "WeightStandardizedConv2d")
target_attrs = ["eps", "epsilon", "variance_epsilon"]
if is_torch_available() and isinstance(model, torch.nn.Module):
for module in model.modules():
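The hunk above only shows the setup of set_model_for_less_flaky_test; a hedged sketch of how the loop over model.modules() presumably continues (the epsilon value used here is an assumption, the real one lives in the part of the function that is not shown):

import torch

def raise_norm_eps(model: torch.nn.Module, value: float = 1.0) -> None:
    # Sketch: bump the epsilon of every norm-like layer so that tiny numerical
    # differences between batched and single-row runs are damped.
    target_names = ("LayerNorm", "GroupNorm", "BatchNorm", "RMSNorm", "BatchNorm2d",
                    "BatchNorm1d", "BitGroupNormActivation", "WeightStandardizedConv2d")
    target_attrs = ("eps", "epsilon", "variance_epsilon")
    for module in model.modules():
        if any(name in type(module).__name__ for name in target_names):
            for attr in target_attrs:
                if hasattr(module, attr):
                    setattr(module, attr, value)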
4 changes: 4 additions & 0 deletions tests/models/autoformer/test_modeling_autoformer.py
@@ -217,6 +217,10 @@ def setUp(self):
self.model_tester = AutoformerModelTester(self)
self.config_tester = ConfigTester(self, config_class=AutoformerConfig, has_text_modality=False)

@is_flaky(description="The computation of `tmp_delay` in `AutoformerAttention.forward` seems wrong, see PR #12345. Also `topk` is used to compute indices which is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

def test_config(self):
self.config_tester.run_common_tests()

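For context on why index computations such as topk/argmax make test_batching_equivalence flaky, here is a small illustration (not taken from the PR): batched and single-row matmuls can differ by a few float32 ULPs, and on nearly tied scores that is enough to flip the selected index.

import torch

torch.manual_seed(0)
proj = torch.nn.Linear(16, 8)
x = torch.randn(4, 10, 16)

batched = proj(x)       # the whole batch at once
single = proj(x[:1])    # the first row computed on its own

# The same row typically differs by a few ULPs between the two runs ...
print((batched[:1] - single).abs().max())
# ... which can flip argmax/topk whenever two scores are nearly tied
# (whether it actually flips here depends on hardware and kernels).
print(torch.equal(batched[:1].argmax(-1), single.argmax(-1)))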
5 changes: 5 additions & 0 deletions tests/models/dac/test_modeling_dac.py
@@ -146,6 +146,11 @@ def test_model_forward(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model_forward(*config_and_inputs)

# TODO (ydshieh): Although we have a potential cause, it's still strange that this test fails all the time with large differences
@unittest.skip(reason="Might be caused by `indices` computed with `max()` in `decode_latents`")
def test_batching_equivalence(self):
super().test_batching_equivalence()

def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()

1 change: 0 additions & 1 deletion tests/models/dpt/test_modeling_dpt_hybrid.py
@@ -304,7 +304,6 @@ def test_raise_readout_type(self):
with self.assertRaises(ValueError):
_ = DPTForDepthEstimation(config)

@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
def test_batching_equivalence(self):
super().test_batching_equivalence()

6 changes: 5 additions & 1 deletion tests/models/esm/test_modeling_esmfold.py
@@ -17,7 +17,7 @@
import unittest

from transformers import EsmConfig, is_torch_available
from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device
from transformers.testing_utils import TestCasePlus, is_flaky, require_torch, slow, torch_device

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
@@ -184,6 +184,10 @@ def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)

@is_flaky(description="Unknown flaky reason ...")
def test_batching_equivalence(self):
super().test_batching_equivalence()

@unittest.skip(reason="Does not support attention outputs")
def test_attention_outputs(self):
pass
6 changes: 5 additions & 1 deletion tests/models/flaubert/test_modeling_flaubert.py
@@ -17,7 +17,7 @@
import unittest

from transformers import FlaubertConfig, is_sacremoses_available, is_torch_available
from transformers.testing_utils import require_torch, require_torch_accelerator, slow, torch_device
from transformers.testing_utils import is_flaky, require_torch, require_torch_accelerator, slow, torch_device

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
@@ -439,6 +439,10 @@ def test_flaubert_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_flaubert_model(*config_and_inputs)

@is_flaky(description="The indices computed with `topk()` in `SQuADHead` (of `FlaubertForQuestionAnswering`) is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

# Copied from tests/models/distilbert/test_modeling_distilbert.py with Distilbert->Flaubert
def test_flaubert_model_with_sinusoidal_encodings(self):
config = FlaubertConfig(sinusoidal_embeddings=True)
10 changes: 9 additions & 1 deletion tests/models/groupvit/test_modeling_groupvit.py
@@ -24,7 +24,7 @@
import requests

from transformers import GroupViTConfig, GroupViTTextConfig, GroupViTVisionConfig
from transformers.testing_utils import is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device
from transformers.testing_utils import is_flaky, is_pt_tf_cross_test, require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available

from ...test_configuration_common import ConfigTester
@@ -162,6 +162,10 @@ def test_config(self):
def test_inputs_embeds(self):
pass

@is_flaky(description="The `index` computed with `max()` in `hard_softmax` is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

@is_pt_tf_cross_test
def test_pt_tf_model_equivalence(self):
import tensorflow as tf
@@ -571,6 +575,10 @@ def test_model(self):
def test_config(self):
self.config_tester.run_common_tests()

@is_flaky(description="The `index` computed with `max()` in `hard_softmax` is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

@unittest.skip(reason="hidden_states are tested in individual model tests")
def test_hidden_states_output(self):
pass
1 change: 0 additions & 1 deletion tests/models/mimi/test_modeling_mimi.py
@@ -734,7 +734,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self):
def test_sdpa_can_compile_dynamic(self):
pass

@is_flaky()
def test_batching_equivalence(self):
super().test_batching_equivalence()

1 change: 0 additions & 1 deletion tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py
@@ -214,7 +214,6 @@ def test_model_from_pretrained(self):
model = MobileNetV1Model.from_pretrained(model_name)
self.assertIsNotNone(model)

@is_flaky(description="is_flaky https://github.com/huggingface/transformers/pull/31258")
def test_batching_equivalence(self):
super().test_batching_equivalence()

1 change: 0 additions & 1 deletion tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py
@@ -269,7 +269,6 @@ def test_model_from_pretrained(self):
model = MobileNetV2Model.from_pretrained(model_name)
self.assertIsNotNone(model)

@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
def test_batching_equivalence(self):
super().test_batching_equivalence()

1 change: 0 additions & 1 deletion tests/models/mobilevit/test_modeling_mobilevit.py
@@ -274,7 +274,6 @@ def test_model_from_pretrained(self):
model = MobileViTModel.from_pretrained(model_name)
self.assertIsNotNone(model)

@is_flaky(description="is_flaky https://github.com/huggingface/transformers/issues/29516")
def test_batching_equivalence(self):
super().test_batching_equivalence()

6 changes: 5 additions & 1 deletion tests/models/superpoint/test_modeling_superpoint.py
@@ -16,7 +16,7 @@
from typing import List

from transformers.models.superpoint.configuration_superpoint import SuperPointConfig
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.testing_utils import is_flaky, require_torch, require_vision, slow, torch_device
from transformers.utils import cached_property, is_torch_available, is_vision_available

from ...test_configuration_common import ConfigTester
@@ -135,6 +135,10 @@ def setUp(self):
def test_config(self):
self.config_tester.run_common_tests()

@is_flaky(description="The `indices` computed with `topk()` in `top_k_keypoints` is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

@unittest.skip(reason="SuperPointForKeypointDetection does not use inputs_embeds")
def test_inputs_embeds(self):
pass
6 changes: 5 additions & 1 deletion tests/models/timm_backbone/test_modeling_timm_backbone.py
@@ -18,7 +18,7 @@
import unittest

from transformers import AutoBackbone
from transformers.testing_utils import require_timm, require_torch, torch_device
from transformers.testing_utils import is_flaky, require_timm, require_torch, torch_device
from transformers.utils.import_utils import is_torch_available

from ...test_backbone_common import BackboneTesterMixin
@@ -115,6 +115,10 @@ def setUp(self):
def test_config(self):
self.config_tester.run_common_tests()

@is_flaky(description="`TimmBackbone` has no `_init_weights`. Timm's way of weight init. seems to give larger magnitude in the intermediate values during `forward`.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

def test_timm_transformer_backbone_equivalence(self):
timm_checkpoint = "resnet18"
transformers_checkpoint = "microsoft/resnet-18"
6 changes: 5 additions & 1 deletion tests/models/unispeech/test_modeling_unispeech.py
@@ -22,7 +22,7 @@
from datasets import load_dataset

from transformers import UniSpeechConfig, is_torch_available
from transformers.testing_utils import require_soundfile, require_torch, slow, torch_device
from transformers.testing_utils import is_flaky, require_soundfile, require_torch, slow, torch_device

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@@ -329,6 +329,10 @@ def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)

@is_flaky(description="The `codevector_idx` computed with `argmax()` in `UniSpeechGumbelVectorQuantizer.forward` is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

def test_batched_inference(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_batch_inference(*config_and_inputs)
5 changes: 5 additions & 0 deletions tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -30,6 +30,7 @@
from transformers.testing_utils import (
CaptureLogger,
cleanup,
is_flaky,
is_pt_flax_cross_test,
is_pyctcdecode_available,
is_torchaudio_available,
@@ -863,6 +864,10 @@ def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)

@is_flaky(description="The `codevector_idx` computed with `argmax()` in `Wav2Vec2GumbelVectorQuantizer.forward` is not stable.")
def test_batching_equivalence(self):
super().test_batching_equivalence()

def test_model_with_adapter(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model_with_adapter(*config_and_inputs)