From 7f95372c6267d3163fd2aa74aeff9d84ddb6cc35 Mon Sep 17 00:00:00 2001 From: Victor Agostinelli <35327788+agostinv@users.noreply.github.com> Date: Fri, 6 Dec 2024 04:39:45 -0800 Subject: [PATCH 001/110] Add feature dim attributes to BitLinear for easier PEFT integration (#34946) Update bitnet.py, extremely small change to allow for easier PEFT integration Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> --- src/transformers/integrations/bitnet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/integrations/bitnet.py b/src/transformers/integrations/bitnet.py index 3386bdcb43b27c..0b50f9738afb69 100644 --- a/src/transformers/integrations/bitnet.py +++ b/src/transformers/integrations/bitnet.py @@ -127,6 +127,8 @@ class BitLinear(nn.Module): def __init__(self, in_features: int, out_features: int, bias: bool, device=None, dtype=None): super().__init__() self.dtype = dtype + self.in_features = in_features + self.out_features = out_features self.register_buffer( "weight", torch.zeros( From c8c8dffbe45ebef0a8dba4a51024e5e5e498596b Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Fri, 6 Dec 2024 13:42:51 +0000 Subject: [PATCH 002/110] Update I-JEPA checkpoints path (#35120) Update checkpoints path --- docs/source/en/model_doc/ijepa.md | 2 +- src/transformers/models/ijepa/modular_ijepa.py | 2 +- tests/models/ijepa/test_modeling_ijepa.py | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md index 9a0cd368a8188f..32944e2617eae1 100644 --- a/docs/source/en/model_doc/ijepa.md +++ b/docs/source/en/model_doc/ijepa.md @@ -45,7 +45,7 @@ url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg" image_1 = Image.open(requests.get(url_1, stream=True).raw) image_2 = Image.open(requests.get(url_2, stream=True).raw) -model_id = "jmtzt/ijepa_vith14_1k" +model_id = "facebook/ijepa_vith14_1k" processor = AutoProcessor.from_pretrained(model_id) model = AutoModel.from_pretrained(model_id) diff --git a/src/transformers/models/ijepa/modular_ijepa.py b/src/transformers/models/ijepa/modular_ijepa.py index efbd71d91342fd..3b3756dd5ce697 100644 --- a/src/transformers/models/ijepa/modular_ijepa.py +++ b/src/transformers/models/ijepa/modular_ijepa.py @@ -155,7 +155,7 @@ def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mas self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token) -_IMAGE_CLASS_CHECKPOINT = "jmtzt/ijepa_vith14_1k" +_IMAGE_CLASS_CHECKPOINT = "facebook/ijepa_vith14_1k" _IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" diff --git a/tests/models/ijepa/test_modeling_ijepa.py b/tests/models/ijepa/test_modeling_ijepa.py index 27a79bc6724285..723ddcf7988826 100644 --- a/tests/models/ijepa/test_modeling_ijepa.py +++ b/tests/models/ijepa/test_modeling_ijepa.py @@ -250,7 +250,7 @@ def test_for_image_classification(self): @slow def test_model_from_pretrained(self): - model_name = "jmtzt/ijepa_vith14_1k" + model_name = "facebook/ijepa_vith14_1k" model = IJepaModel.from_pretrained(model_name) self.assertIsNotNone(model) @@ -266,11 +266,11 @@ def prepare_img(): class IJepaModelIntegrationTest(unittest.TestCase): @cached_property def default_image_processor(self): - return ViTImageProcessor.from_pretrained("jmtzt/ijepa_vith14_1k") if is_vision_available() else None + return ViTImageProcessor.from_pretrained("facebook/ijepa_vith14_1k") if is_vision_available() else None @slow def test_inference_no_head(self): - model = 
IJepaModel.from_pretrained("jmtzt/ijepa_vith14_1k").to(torch_device) + model = IJepaModel.from_pretrained("facebook/ijepa_vith14_1k").to(torch_device) image_processor = self.default_image_processor image = prepare_img() @@ -299,7 +299,7 @@ def test_inference_fp16(self): A small test to make sure that inference work in half precision without any problem. """ model = IJepaModel.from_pretrained( - "jmtzt/ijepa_vith14_1k", + "facebook/ijepa_vith14_1k", torch_dtype=torch.float16, device_map="auto", ) @@ -319,7 +319,7 @@ def test_inference_interpolate_pos_encoding(self): # allowing to interpolate the pre-trained position embeddings in order to use # the model on higher resolutions. The DINO model by Facebook AI leverages this # to visualize self-attention on higher resolution images. - model = IJepaModel.from_pretrained("jmtzt/ijepa_vith14_1k").to(torch_device) + model = IJepaModel.from_pretrained("facebook/ijepa_vith14_1k").to(torch_device) image_processor = self.default_image_processor image = prepare_img() From 1ccca8f48c493a4804921da66f65023e3f9c8d9c Mon Sep 17 00:00:00 2001 From: kang sheng Date: Mon, 9 Dec 2024 16:57:41 +0800 Subject: [PATCH 003/110] Fix GA loss bugs and add unit test (#35121) * fix GA bugs and add unit test * narrow down model loss unit test diff gap * format code to make ruff happy * send num_items_in_batch argument to decoder * fix GA loss bug in BertLMHeadModel * use TinyStories-33M to narrow down diff gap * fotmat code * missing .config * avoid add extra args --------- Co-authored-by: kangsheng --- src/transformers/models/bert/modeling_bert.py | 7 +- .../modeling_speech_encoder_decoder.py | 2 + src/transformers/trainer.py | 9 +- tests/trainer/test_trainer.py | 114 ++++++++++++++++-- 4 files changed, 108 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 6b05fa648158a6..e311f93b6c81ed 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1325,6 +1325,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **loss_kwargs, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -1375,11 +1376,7 @@ def forward( lm_loss = None if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs) if not return_dict: output = (prediction_scores,) + outputs[2:] diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index 0d2b911bebe582..3bff8f6acd290d 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -491,6 +491,8 @@ def forward( kwargs_decoder = { argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } + if "num_items_in_batch" in 
kwargs_encoder: + kwargs_decoder["num_items_in_batch"] = kwargs_encoder.pop("num_items_in_batch", None) if encoder_outputs is None: if inputs is None: diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index af908e48e4b8c4..f7d79481809807 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -3649,10 +3649,7 @@ def training_step( return loss_mb.reduce_mean().detach().to(self.args.device) with self.compute_loss_context_manager(): - if self.model_accepts_loss_kwargs: - loss = self.compute_loss(model, inputs) - else: - loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) + loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) del inputs if ( @@ -5132,10 +5129,6 @@ def get_batch_samples(self, epoch_iterator, num_batches): except StopIteration: break - # Keep default behavior the same - if not self.model_accepts_loss_kwargs: - return batch_samples, None - if len(batch_samples) > 0 and "labels" in batch_samples[0]: # For now we don't support object detection try: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index f7b4a8637bff85..d33be2789761da 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -750,11 +750,102 @@ def test_model_init(self): self.check_trained_model(trainer.model, alternate_seed=True) @slow - def test_gradient_accumulation_loss_alignment(self): + def test_gradient_accumulation_loss_alignment_with_model_loss(self): set_seed(42) import datasets - model_name = "distilgpt2" + model_name = "nickypro/tinyllama-110M" + dataset_name = "wikitext" + dataset_config = "wikitext-2-raw-v1" + dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:500]") + dataset = dataset.train_test_split(test_size=0.2) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + tokenizer.pad_token = tokenizer.eos_token + + def tokenize_function(examples): + return tokenizer(examples["text"], max_length=128, padding="max_length", truncation=True) + + tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + + model = AutoModelForCausalLM.from_pretrained(model_name) + + base_loss_callback = StoreLossCallback() + + args_kwargs = { + "report_to": "none", + "logging_steps": 1, + "max_steps": 20, + "learning_rate": 3e-4, + "disable_tqdm": True, + } + + args = TrainingArguments( + "./generation", + **args_kwargs, + ) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[base_loss_callback], + data_collator=data_collator, + ) + assert trainer.model_accepts_loss_kwargs + trainer.train() + + grad_accum_loss_callback = StoreLossCallback() + args = TrainingArguments( + "./generation", + **args_kwargs, + gradient_accumulation_steps=2, + per_device_train_batch_size=4, + ) + set_seed(42) + model = AutoModelForCausalLM.from_pretrained(model_name) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[grad_accum_loss_callback], + data_collator=data_collator, + ) + trainer.train() + + set_seed(42) + model = AutoModelForCausalLM.from_pretrained(model_name) + broken_loss_callback = StoreLossCallback() + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[broken_loss_callback], + data_collator=data_collator, + ) + # disable model_accepts_loss_kwargs + trainer.model_accepts_loss_kwargs = False + 
trainer.train() + + # Calculate the difference between the base loss and the grad_accum loss + diff_truth = [ + abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses) + ] + diff_broken = [abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)] + + # all diff truth should be quite close + self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01") + + # max diff broken should be very off + self.assertGreater(max(diff_broken), 3, f"Difference {max(diff_broken)} is not greater than 3") + + @slow + def test_gradient_accumulation_loss_alignment_with_loss_func(self): + set_seed(42) + import datasets + + model_name = "roneneldan/TinyStories-33M" dataset_name = "wikitext" dataset_config = "wikitext-2-raw-v1" dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:500]") @@ -836,15 +927,16 @@ def compute_loss(logits, labels, vocab_size, num_items_in_batch, disable_num_ite trainer.train() # Calculate the difference between the base loss and the grad_accum loss - diff_truth = [base - grad for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)] - diff_broken = [base - grad for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)] - # These should be quite close - for diff in diff_truth: - self.assertLess(abs(diff), 0.1, f"Difference {diff} is not within 0.1") - - # These should be very off - for diff in diff_broken: - self.assertGreater(abs(diff), 0.1, f"Difference {diff} is not greater than 0.1") + diff_truth = [ + abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses) + ] + diff_broken = [abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)] + + # all diff truth should be quite close + self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01") + + # max diff broken should be very off + self.assertGreater(max(diff_broken), 3, f"Difference {max(diff_broken)} is not greater than 3") def test_gradient_accumulation(self): # Training with half the batch size but accumulation steps as 2 should give the same training losses. From 9e420e02698f73a70ec1c99961f166c1b5df98bd Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:01:31 +0100 Subject: [PATCH 004/110] [I-JEPA] Update docs (#35148) Update docs --- docs/source/en/model_doc/ijepa.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md index 32944e2617eae1..cb2afd25e20bca 100644 --- a/docs/source/en/model_doc/ijepa.md +++ b/docs/source/en/model_doc/ijepa.md @@ -18,13 +18,18 @@ rendered properly in your Markdown viewer. ## Overview -The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://arxiv.org/pdf/2301.08243.pdf) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas. +The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://arxiv.org/abs/2301.08243) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas. I-JEPA is a self-supervised learning method that predicts the representations of one part of an image based on other parts of the same image. 
This approach focuses on learning semantic features without relying on pre-defined invariances from hand-crafted data transformations, which can bias specific tasks, or on filling in pixel-level details, which often leads to less meaningful representations. The abstract from the paper is the following: This paper demonstrates an approach for learning highly semantic image representations without relying on hand-crafted data-augmentations. We introduce the Image- based Joint-Embedding Predictive Architecture (I-JEPA), a non-generative approach for self-supervised learning from images. The idea behind I-JEPA is simple: from a single context block, predict the representations of various target blocks in the same image. A core design choice to guide I-JEPA towards producing semantic representations is the masking strategy; specifically, it is crucial to (a) sample tar- get blocks with sufficiently large scale (semantic), and to (b) use a sufficiently informative (spatially distributed) context block. Empirically, when combined with Vision Transform- ers, we find I-JEPA to be highly scalable. For instance, we train a ViT-Huge/14 on ImageNet using 16 A100 GPUs in under 72 hours to achieve strong downstream performance across a wide range of tasks, from linear classification to object counting and depth prediction. + + + I-JEPA architecture. Taken from the original paper. + This model was contributed by [jmtzt](https://huggingface.co/jmtzt). The original code can be found [here](https://github.com/facebookresearch/ijepa). @@ -63,6 +68,15 @@ similarity = cosine_similarity(embed_1, embed_2) print(similarity) ``` +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with I-JEPA. + + + +- [`IJepaForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + ## IJepaConfig [[autodoc]] IJepaConfig @@ -75,4 +89,4 @@ print(similarity) ## IJepaForImageClassification [[autodoc]] IJepaForImageClassification - - forward + - forward \ No newline at end of file From 1452dc2514f14879d2662db06e2cd1227919bae9 Mon Sep 17 00:00:00 2001 From: UV Date: Mon, 9 Dec 2024 15:12:23 +0530 Subject: [PATCH 005/110] Corrected typo in agent system prompts (#35143) --- src/transformers/agents/prompts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/agents/prompts.py b/src/transformers/agents/prompts.py index 7a84b1db44faba..898a7e011a2b05 100644 --- a/src/transformers/agents/prompts.py +++ b/src/transformers/agents/prompts.py @@ -129,7 +129,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): ``` --- -Above example were using tools that might not exist for you. You only have acces to those Tools: +Above example were using tools that might not exist for you. You only have access to these Tools: <> Remember to make sure that variables you use are all defined. @@ -256,7 +256,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): } -Above example were using notional tools that might not exist for you. You only have acces to those tools: +Above example were using notional tools that might not exist for you. 
You only have access to these tools: <> Here are the rules you should always follow to solve your task: @@ -348,7 +348,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): final_answer(pope_current_age) ``` -Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you have acces to those tools (and no other tool): +Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you have access to these tools (and no other tool): <> From de8a0b7547451f8f7ef2c0ac1f338ba77c614cec Mon Sep 17 00:00:00 2001 From: Daniel Bogdoll Date: Mon, 9 Dec 2024 05:29:04 -0500 Subject: [PATCH 006/110] Option to set 'non_blocking' for to(device) in BatchEncoding and BatchFeature (#34883) * Option to set 'non_blocking' for to(device) operation for performance improvements. Defaults to 'false', thus no behavioral changes. * Enabling non_blocking in to() operation of BatchFeature. * Improved docstring on utilization of non_blocking * Force non_blocking as keyword argument Co-authored-by: Pavel Iakubovskii --------- Co-authored-by: Daniel Bogdoll Co-authored-by: Pavel Iakubovskii --- src/transformers/feature_extraction_utils.py | 4 +++- src/transformers/tokenization_utils_base.py | 10 +++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index f3cde8180c1bd4..6e8007edbc0b78 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -213,6 +213,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": Will be passed to the `to(...)` function of the tensors. kwargs (`Dict`, *optional*): Will be passed to the `to(...)` function of the tensors. + To enable asynchronous data transfer, set the `non_blocking` flag in `kwargs` (defaults to `False`). Returns: [`BatchFeature`]: The same instance after modification. @@ -222,6 +223,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": new_data = {} device = kwargs.get("device") + non_blocking = kwargs.get("non_blocking", False) # Check if the args are a device or a dtype if device is None and len(args) > 0: # device should be always the first argument @@ -241,7 +243,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": # cast and send to device new_data[k] = v.to(*args, **kwargs) elif isinstance(v, torch.Tensor) and device is not None: - new_data[k] = v.to(device=device) + new_data[k] = v.to(device=device, non_blocking=non_blocking) else: new_data[k] = v self.data = new_data diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 0bfcc4aa303665..f4e5b9b3aaf314 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -799,12 +799,13 @@ def as_tensor(value, dtype=None): return self - def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": + def to(self, device: Union[str, "torch.device"], *, non_blocking: bool = False) -> "BatchEncoding": """ - Send all values to device by calling `v.to(device)` (PyTorch only). + Send all values to device by calling `v.to(device, non_blocking=non_blocking)` (PyTorch only). Args: device (`str` or `torch.device`): The device to put the tensors on. + non_blocking (`bool`): Whether to perform the copy asynchronously. Returns: [`BatchEncoding`]: The same instance after modification. 
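(For illustration only, not part of the patch above: a minimal sketch of how the new keyword-only `non_blocking` flag on `BatchEncoding.to` could be used once this change lands; the checkpoint name and CUDA device are arbitrary assumptions, not taken from the patch.)

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer("an example sentence", return_tensors="pt")
if torch.cuda.is_available():
    # non_blocking=True lets the host-to-device copy run asynchronously;
    # it mainly pays off when the source tensors sit in pinned (page-locked) memory
    # and the transfer can overlap with other work on a CUDA stream.
    encoding = encoding.to("cuda", non_blocking=True)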
@@ -816,7 +817,10 @@ def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": # Otherwise it passes the casts down and casts the LongTensor containing the token idxs # into a HalfTensor if isinstance(device, str) or is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) if isinstance(v, torch.Tensor) else v for k, v in self.data.items()} + self.data = { + k: v.to(device=device, non_blocking=non_blocking) if isinstance(v, torch.Tensor) else v + for k, v in self.data.items() + } else: logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.") return self From 7238387f67ab89c41502414e552c703a362d3bf5 Mon Sep 17 00:00:00 2001 From: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:13:36 +0100 Subject: [PATCH 007/110] Fix typo in EETQ Tests (#35160) fix --- tests/quantization/eetq_integration/test_eetq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/quantization/eetq_integration/test_eetq.py b/tests/quantization/eetq_integration/test_eetq.py index 2c01f8145cba0e..f14fa076e4bb76 100644 --- a/tests/quantization/eetq_integration/test_eetq.py +++ b/tests/quantization/eetq_integration/test_eetq.py @@ -119,7 +119,7 @@ def test_quantized_model_conversion(self): self.assertEqual(nb_linears - 1, nb_eetq_linear) - # Try with `linear_weights_not_to_quantize` + # Try with `modules_to_not_convert` with init_empty_weights(): model = OPTForCausalLM(config) quantization_config = EetqConfig(modules_to_not_convert=["fc1"]) @@ -128,7 +128,7 @@ def test_quantized_model_conversion(self): for module in model.modules(): if isinstance(module, EetqLinear): nb_eetq_linear += 1 - + # 25 corresponds to the lm_head along with 24 fc1 layers. 
self.assertEqual(nb_linears - 25, nb_eetq_linear) def test_quantized_model(self): From 8e806a336f53af4603fcc6868e6abd9058fbb8fd Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 9 Dec 2024 16:09:50 +0100 Subject: [PATCH 008/110] Cleanup: continue the init refactor (#35167) Round 2 --- .../audio_spectrogram_transformer/__init__.py | 48 +---- ...iguration_audio_spectrogram_transformer.py | 3 + ...xtraction_audio_spectrogram_transformer.py | 3 + .../modeling_audio_spectrogram_transformer.py | 3 + src/transformers/models/bark/__init__.py | 65 +----- .../models/bark/configuration_bark.py | 3 + src/transformers/models/bark/modeling_bark.py | 10 + .../models/bark/processing_bark.py | 3 + src/transformers/models/bart/__init__.py | 138 ++----------- .../models/bart/configuration_bart.py | 3 + src/transformers/models/bart/modeling_bart.py | 12 ++ .../models/bart/modeling_flax_bart.py | 11 + .../models/bart/modeling_tf_bart.py | 3 + .../models/bart/tokenization_bart.py | 3 + .../models/bart/tokenization_bart_fast.py | 3 + src/transformers/models/barthez/__init__.py | 46 +---- .../models/barthez/tokenization_barthez.py | 3 + .../barthez/tokenization_barthez_fast.py | 3 + src/transformers/models/bartpho/__init__.py | 28 +-- .../models/bartpho/tokenization_bartpho.py | 3 + src/transformers/models/beit/__init__.py | 101 +--------- .../models/beit/configuration_beit.py | 3 + .../models/beit/feature_extraction_beit.py | 3 + .../models/beit/image_processing_beit.py | 3 + src/transformers/models/beit/modeling_beit.py | 10 + .../models/beit/modeling_flax_beit.py | 8 + src/transformers/models/bert/__init__.py | 189 ++---------------- .../models/bert/configuration_bert.py | 3 + src/transformers/models/bert/modeling_bert.py | 16 ++ .../models/bert/modeling_flax_bert.py | 14 ++ .../models/bert/modeling_tf_bert.py | 16 ++ .../models/bert/tokenization_bert.py | 3 + .../models/bert/tokenization_bert_fast.py | 3 + .../models/bert/tokenization_bert_tf.py | 3 + .../models/bert_generation/__init__.py | 59 +----- .../configuration_bert_generation.py | 3 + .../modeling_bert_generation.py | 8 + .../tokenization_bert_generation.py | 3 + .../models/bert_japanese/__init__.py | 13 +- .../tokenization_bert_japanese.py | 3 + src/transformers/models/bertweet/__init__.py | 13 +- .../models/bertweet/tokenization_bertweet.py | 3 + 42 files changed, 263 insertions(+), 611 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/__init__.py b/src/transformers/models/audio_spectrogram_transformer/__init__.py index 9f1d65e1aac839..3fe10d60c03a92 100644 --- a/src/transformers/models/audio_spectrogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectrogram_transformer/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,47 +13,17 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available - - -_import_structure = { - "configuration_audio_spectrogram_transformer": ["ASTConfig"], - "feature_extraction_audio_spectrogram_transformer": ["ASTFeatureExtractor"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_audio_spectrogram_transformer"] = [ - "ASTForAudioClassification", - "ASTModel", - "ASTPreTrainedModel", - ] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .configuration_audio_spectrogram_transformer import ( - ASTConfig, - ) - from .feature_extraction_audio_spectrogram_transformer import ASTFeatureExtractor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_audio_spectrogram_transformer import ( - ASTForAudioClassification, - ASTModel, - ASTPreTrainedModel, - ) - - + from .configuration_audio_spectrogram_transformer import * + from .convert_audio_spectrogram_transformer_original_to_pytorch import * + from .feature_extraction_audio_spectrogram_transformer import * + from .modeling_audio_spectrogram_transformer import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index 7980667a68d7c5..77bec930236f60 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -126,3 +126,6 @@ def __init__( # generative parameters deprecation cycle, overwriting this function prevents this from happening. 
def _get_non_default_generation_parameters(self) -> Dict[str, Any]: return {} + + +__all__ = ["ASTConfig"] diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index 2bd122b4098c36..b181afe19e9ef8 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -234,3 +234,6 @@ def __call__( padded_inputs = padded_inputs.convert_to_tensors(return_tensors) return padded_inputs + + +__all__ = ["ASTFeatureExtractor"] diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 491c6ce164611a..a9fe0d75f5c380 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -670,3 +670,6 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +__all__ = ["ASTForAudioClassification", "ASTModel", "ASTPreTrainedModel"] diff --git a/src/transformers/models/bark/__init__.py b/src/transformers/models/bark/__init__.py index 4cb1a606cf6567..6c21cf99976a15 100644 --- a/src/transformers/models/bark/__init__.py +++ b/src/transformers/models/bark/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,63 +13,18 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = { - "configuration_bark": [ - "BarkCoarseConfig", - "BarkConfig", - "BarkFineConfig", - "BarkSemanticConfig", - ], - "processing_bark": ["BarkProcessor"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bark"] = [ - "BarkFineModel", - "BarkSemanticModel", - "BarkCoarseModel", - "BarkModel", - "BarkPreTrainedModel", - "BarkCausalModel", - ] - if TYPE_CHECKING: - from .configuration_bark import ( - BarkCoarseConfig, - BarkConfig, - BarkFineConfig, - BarkSemanticConfig, - ) - from .processing_bark import BarkProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bark import ( - BarkCausalModel, - BarkCoarseModel, - BarkFineModel, - BarkModel, - BarkPreTrainedModel, - BarkSemanticModel, - ) - + from .configuration_bark import * + from .convert_suno_to_hf import * + from .generation_configuration_bark import * + from .modeling_bark import * + from .processing_bark import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index a498d1dd19371d..932bad618aa187 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -298,3 +298,6 @@ def from_sub_model_configs( codec_config=codec_config.to_dict(), **kwargs, ) + + +__all__ = ["BarkCoarseConfig", "BarkConfig", "BarkFineConfig", "BarkSemanticConfig"] diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index f1c77367e5beb7..9e225ac9ae15c0 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -1819,3 +1819,13 @@ def _check_and_enable_flash_attn_2( config.coarse_acoustics_config._attn_implementation = config._attn_implementation config.fine_acoustics_config._attn_implementation = config._attn_implementation return config + + +__all__ = [ + "BarkFineModel", + "BarkSemanticModel", + "BarkCoarseModel", + "BarkModel", + "BarkPreTrainedModel", + "BarkCausalModel", +] diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index 53715f3260422c..0bed6ca79f410b 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -285,3 +285,6 @@ def __call__( encoded_text["history_prompt"] = voice_preset return encoded_text + + +__all__ = ["BarkProcessor"] diff --git a/src/transformers/models/bart/__init__.py b/src/transformers/models/bart/__init__.py index d538fbb7d34304..11c3f4863f46a1 100644 --- a/src/transformers/models/bart/__init__.py +++ b/src/transformers/models/bart/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,134 +13,20 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_flax_available, - is_tf_available, - is_tokenizers_available, - is_torch_available, -) +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = { - "configuration_bart": ["BartConfig", "BartOnnxConfig"], - "tokenization_bart": ["BartTokenizer"], -} - -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bart_fast"] = ["BartTokenizerFast"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bart"] = [ - "BartForCausalLM", - "BartForConditionalGeneration", - "BartForQuestionAnswering", - "BartForSequenceClassification", - "BartModel", - "BartPreTrainedModel", - "BartPretrainedModel", - "PretrainedBartModel", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_bart"] = [ - "TFBartForConditionalGeneration", - "TFBartForSequenceClassification", - "TFBartModel", - "TFBartPretrainedModel", - ] - -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_bart"] = [ - "FlaxBartDecoderPreTrainedModel", - "FlaxBartForCausalLM", - "FlaxBartForConditionalGeneration", - "FlaxBartForQuestionAnswering", - "FlaxBartForSequenceClassification", - "FlaxBartModel", - "FlaxBartPreTrainedModel", - ] - if TYPE_CHECKING: - from .configuration_bart import BartConfig, BartOnnxConfig - from .tokenization_bart import BartTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bart_fast import BartTokenizerFast - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bart import ( - BartForCausalLM, - BartForConditionalGeneration, - BartForQuestionAnswering, - BartForSequenceClassification, - BartModel, - BartPreTrainedModel, - BartPretrainedModel, - PretrainedBartModel, - ) - - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_bart import ( - TFBartForConditionalGeneration, - TFBartForSequenceClassification, - TFBartModel, - TFBartPretrainedModel, - ) - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_bart import ( - FlaxBartDecoderPreTrainedModel, - FlaxBartForCausalLM, - FlaxBartForConditionalGeneration, - FlaxBartForQuestionAnswering, - FlaxBartForSequenceClassification, - FlaxBartModel, - FlaxBartPreTrainedModel, - ) - + from .configuration_bart import * + from .convert_bart_original_pytorch_checkpoint_to_pytorch import * + from .modeling_bart import * + from .modeling_flax_bart import * + from .modeling_tf_bart import * + from .tokenization_bart import * + from .tokenization_bart_fast import * else: import 
sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index a3bc7f38653a8a..4ce4316e3c0315 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -400,3 +400,6 @@ def _flatten_past_key_values_(self, flattened_output, name, idx, t): flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_( flattened_output, name, idx, t ) + + +__all__ = ["BartConfig", "BartOnnxConfig"] diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 07c1fa622ea3b6..dd1b69c8127fb8 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -2158,3 +2158,15 @@ def _reorder_cache(past_key_values, beam_idx): tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), ) return reordered_past + + +__all__ = [ + "BartForCausalLM", + "BartForConditionalGeneration", + "BartForQuestionAnswering", + "BartForSequenceClassification", + "BartModel", + "BartPreTrainedModel", + "BartPretrainedModel", + "PretrainedBartModel", +] diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py index 634c256fe7d81d..b346eaa39fc199 100644 --- a/src/transformers/models/bart/modeling_flax_bart.py +++ b/src/transformers/models/bart/modeling_flax_bart.py @@ -1993,3 +1993,14 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): FlaxCausalLMOutputWithCrossAttentions, _CONFIG_FOR_DOC, ) + + +__all__ = [ + "FlaxBartDecoderPreTrainedModel", + "FlaxBartForCausalLM", + "FlaxBartForConditionalGeneration", + "FlaxBartForQuestionAnswering", + "FlaxBartForSequenceClassification", + "FlaxBartModel", + "FlaxBartPreTrainedModel", +] diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 5ebde8cba60c45..7ab9817986e6ad 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -1709,3 +1709,6 @@ def build(self, input_shape=None): if getattr(self, "classification_head", None) is not None: with tf.name_scope(self.classification_head.name): self.classification_head.build(None) + + +__all__ = ["TFBartForConditionalGeneration", "TFBartForSequenceClassification", "TFBartModel", "TFBartPretrainedModel"] diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index 5207b9c92b07ff..4c516cb81be0d2 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -388,3 +388,6 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): text = " " + text return (text, kwargs) + + +__all__ = ["BartTokenizer"] diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index e9fb8497c907b9..4586ab4797e5ec 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ 
-274,3 +274,6 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + +__all__ = ["BartTokenizerFast"] diff --git a/src/transformers/models/barthez/__init__.py b/src/transformers/models/barthez/__init__.py index 084cd22bdf1d88..323fe2fe8af982 100644 --- a/src/transformers/models/barthez/__init__.py +++ b/src/transformers/models/barthez/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,49 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available - - -_import_structure = {} - -try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_barthez"] = ["BarthezTokenizer"] - -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_barthez_fast"] = ["BarthezTokenizerFast"] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_barthez import BarthezTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_barthez_fast import BarthezTokenizerFast - + from .tokenization_barthez import * + from .tokenization_barthez_fast import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 46decddb3e10ba..604f9c7c21519a 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -284,3 +284,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = fi.write(content_spiece_model) return (out_vocab_file,) + + +__all__ = ["BarthezTokenizer"] diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py index df8cc7757e96c0..a1d95ef03e4882 100644 --- a/src/transformers/models/barthez/tokenization_barthez_fast.py +++ b/src/transformers/models/barthez/tokenization_barthez_fast.py @@ -192,3 +192,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) + + +__all__ = ["BarthezTokenizerFast"] diff --git a/src/transformers/models/bartpho/__init__.py b/src/transformers/models/bartpho/__init__.py index 
c20d7370c6566c..597be95d8175ca 100644 --- a/src/transformers/models/bartpho/__init__.py +++ b/src/transformers/models/bartpho/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,32 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available - +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = {} - -try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bartpho"] = ["BartphoTokenizer"] if TYPE_CHECKING: - try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bartpho import BartphoTokenizer - + from .tokenization_bartpho import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py index df121f26e255f4..e6e4f889842e8f 100644 --- a/src/transformers/models/bartpho/tokenization_bartpho.py +++ b/src/transformers/models/bartpho/tokenization_bartpho.py @@ -311,3 +311,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = fp.write(f"{str(token)} \n") return out_vocab_file, out_monolingual_vocab_file + + +__all__ = ["BartphoTokenizer"] diff --git a/src/transformers/models/beit/__init__.py b/src/transformers/models/beit/__init__.py index c2f49240d6e64c..0fc8919c7ea19a 100644 --- a/src/transformers/models/beit/__init__.py +++ b/src/transformers/models/beit/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,100 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_flax_available, - is_torch_available, - is_vision_available, -) - - -_import_structure = {"configuration_beit": ["BeitConfig", "BeitOnnxConfig"]} - -try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["feature_extraction_beit"] = ["BeitFeatureExtractor"] - _import_structure["image_processing_beit"] = ["BeitImageProcessor"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_beit"] = [ - "BeitForImageClassification", - "BeitForMaskedImageModeling", - "BeitForSemanticSegmentation", - "BeitModel", - "BeitPreTrainedModel", - "BeitBackbone", - ] - +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_beit"] = [ - "FlaxBeitForImageClassification", - "FlaxBeitForMaskedImageModeling", - "FlaxBeitModel", - "FlaxBeitPreTrainedModel", - ] if TYPE_CHECKING: - from .configuration_beit import BeitConfig, BeitOnnxConfig - - try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_beit import BeitFeatureExtractor - from .image_processing_beit import BeitImageProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_beit import ( - BeitBackbone, - BeitForImageClassification, - BeitForMaskedImageModeling, - BeitForSemanticSegmentation, - BeitModel, - BeitPreTrainedModel, - ) - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_beit import ( - FlaxBeitForImageClassification, - FlaxBeitForMaskedImageModeling, - FlaxBeitModel, - FlaxBeitPreTrainedModel, - ) - - + from .configuration_beit import * + from .convert_beit_unilm_to_pytorch import * + from .feature_extraction_beit import * + from .image_processing_beit import * + from .modeling_beit import * + from .modeling_flax_beit import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py index f0f3c2582c35cc..834988258c6b75 100644 --- a/src/transformers/models/beit/configuration_beit.py +++ b/src/transformers/models/beit/configuration_beit.py @@ -224,3 +224,6 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: @property def atol_for_validation(self) -> float: return 1e-4 + + +__all__ = ["BeitConfig", "BeitOnnxConfig"] diff --git a/src/transformers/models/beit/feature_extraction_beit.py b/src/transformers/models/beit/feature_extraction_beit.py index 59dacb4ae51f6e..141d8bc36d2bbb 100644 --- a/src/transformers/models/beit/feature_extraction_beit.py +++ b/src/transformers/models/beit/feature_extraction_beit.py @@ -31,3 +31,6 @@ def __init__(self, *args, **kwargs) -> None: FutureWarning, ) 
super().__init__(*args, **kwargs) + + +__all__ = ["BeitFeatureExtractor"] diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 7398381b2229bf..af76dd2e9656cb 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -510,3 +510,6 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] return semantic_segmentation + + +__all__ = ["BeitImageProcessor"] diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index f972e021f3e2b3..01c16ca2cf000b 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -1576,3 +1576,13 @@ def forward( hidden_states=outputs.hidden_states if output_hidden_states else None, attentions=outputs.attentions, ) + + +__all__ = [ + "BeitForImageClassification", + "BeitForMaskedImageModeling", + "BeitForSemanticSegmentation", + "BeitModel", + "BeitPreTrainedModel", + "BeitBackbone", +] diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py index c1da64d263a266..2d79c1820088a1 100644 --- a/src/transformers/models/beit/modeling_flax_beit.py +++ b/src/transformers/models/beit/modeling_flax_beit.py @@ -946,3 +946,11 @@ class FlaxBeitForImageClassification(FlaxBeitPreTrainedModel): append_replace_return_docstrings( FlaxBeitForImageClassification, output_type=FlaxSequenceClassifierOutput, config_class=BeitConfig ) + + +__all__ = [ + "FlaxBeitForImageClassification", + "FlaxBeitForMaskedImageModeling", + "FlaxBeitModel", + "FlaxBeitPreTrainedModel", +] diff --git a/src/transformers/models/bert/__init__.py b/src/transformers/models/bert/__init__.py index 17048a5d1c967a..3ed12a889321e6 100644 --- a/src/transformers/models/bert/__init__.py +++ b/src/transformers/models/bert/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,183 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_flax_available, - is_tensorflow_text_available, - is_tf_available, - is_tokenizers_available, - is_torch_available, -) - - -_import_structure = { - "configuration_bert": ["BertConfig", "BertOnnxConfig"], - "tokenization_bert": ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"], -} - -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bert_fast"] = ["BertTokenizerFast"] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bert"] = [ - "BertForMaskedLM", - "BertForMultipleChoice", - "BertForNextSentencePrediction", - "BertForPreTraining", - "BertForQuestionAnswering", - "BertForSequenceClassification", - "BertForTokenClassification", - "BertLayer", - "BertLMHeadModel", - "BertModel", - "BertPreTrainedModel", - "load_tf_weights_in_bert", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_bert"] = [ - "TFBertEmbeddings", - "TFBertForMaskedLM", - "TFBertForMultipleChoice", - "TFBertForNextSentencePrediction", - "TFBertForPreTraining", - "TFBertForQuestionAnswering", - "TFBertForSequenceClassification", - "TFBertForTokenClassification", - "TFBertLMHeadModel", - "TFBertMainLayer", - "TFBertModel", - "TFBertPreTrainedModel", - ] -try: - if not is_tensorflow_text_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bert_tf"] = ["TFBertTokenizer"] - -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_bert"] = [ - "FlaxBertForCausalLM", - "FlaxBertForMaskedLM", - "FlaxBertForMultipleChoice", - "FlaxBertForNextSentencePrediction", - "FlaxBertForPreTraining", - "FlaxBertForQuestionAnswering", - "FlaxBertForSequenceClassification", - "FlaxBertForTokenClassification", - "FlaxBertModel", - "FlaxBertPreTrainedModel", - ] if TYPE_CHECKING: - from .configuration_bert import BertConfig, BertOnnxConfig - from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bert_fast import BertTokenizerFast - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bert import ( - BertForMaskedLM, - BertForMultipleChoice, - BertForNextSentencePrediction, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - BertLayer, - BertLMHeadModel, - BertModel, - BertPreTrainedModel, - load_tf_weights_in_bert, - ) - - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_bert import ( - TFBertEmbeddings, - TFBertForMaskedLM, - TFBertForMultipleChoice, - TFBertForNextSentencePrediction, - TFBertForPreTraining, - TFBertForQuestionAnswering, - 
TFBertForSequenceClassification, - TFBertForTokenClassification, - TFBertLMHeadModel, - TFBertMainLayer, - TFBertModel, - TFBertPreTrainedModel, - ) - - try: - if not is_tensorflow_text_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bert_tf import TFBertTokenizer - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_bert import ( - FlaxBertForCausalLM, - FlaxBertForMaskedLM, - FlaxBertForMultipleChoice, - FlaxBertForNextSentencePrediction, - FlaxBertForPreTraining, - FlaxBertForQuestionAnswering, - FlaxBertForSequenceClassification, - FlaxBertForTokenClassification, - FlaxBertModel, - FlaxBertPreTrainedModel, - ) - + from .configuration_bert import * + from .convert_bert_original_tf2_checkpoint_to_pytorch import * + from .convert_bert_original_tf_checkpoint_to_pytorch import * + from .convert_bert_pytorch_checkpoint_to_original_tf import * + from .convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch import * + from .modeling_bert import * + from .modeling_flax_bert import * + from .modeling_tf_bert import * + from .tokenization_bert import * + from .tokenization_bert_fast import * + from .tokenization_bert_tf import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py index 613cf6a11463c2..ea29fb81c435aa 100644 --- a/src/transformers/models/bert/configuration_bert.py +++ b/src/transformers/models/bert/configuration_bert.py @@ -149,3 +149,6 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: ("token_type_ids", dynamic_axis), ] ) + + +__all__ = ["BertConfig", "BertOnnxConfig"] diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index e311f93b6c81ed..0c53963cee7922 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1991,3 +1991,19 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +__all__ = [ + "BertForMaskedLM", + "BertForMultipleChoice", + "BertForNextSentencePrediction", + "BertForPreTraining", + "BertForQuestionAnswering", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertLayer", + "BertLMHeadModel", + "BertModel", + "BertPreTrainedModel", + "load_tf_weights_in_bert", +] diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py index 772ea2bf12b2ee..83358c86bd280d 100644 --- a/src/transformers/models/bert/modeling_flax_bert.py +++ b/src/transformers/models/bert/modeling_flax_bert.py @@ -1711,3 +1711,17 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): FlaxCausalLMOutputWithCrossAttentions, _CONFIG_FOR_DOC, ) + + +__all__ = [ + "FlaxBertForCausalLM", + "FlaxBertForMaskedLM", + "FlaxBertForMultipleChoice", + "FlaxBertForNextSentencePrediction", + "FlaxBertForPreTraining", + "FlaxBertForQuestionAnswering", + "FlaxBertForSequenceClassification", + "FlaxBertForTokenClassification", + "FlaxBertModel", + "FlaxBertPreTrainedModel", +] diff --git a/src/transformers/models/bert/modeling_tf_bert.py 
b/src/transformers/models/bert/modeling_tf_bert.py index bb3281278adaa1..ce862194dc7787 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -2108,3 +2108,19 @@ def build(self, input_shape=None): if getattr(self, "qa_outputs", None) is not None: with tf.name_scope(self.qa_outputs.name): self.qa_outputs.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFBertEmbeddings", + "TFBertForMaskedLM", + "TFBertForMultipleChoice", + "TFBertForNextSentencePrediction", + "TFBertForPreTraining", + "TFBertForQuestionAnswering", + "TFBertForSequenceClassification", + "TFBertForTokenClassification", + "TFBertLMHeadModel", + "TFBertMainLayer", + "TFBertModel", + "TFBertPreTrainedModel", +] diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index 07583b949661de..42d4dd94554d41 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -502,3 +502,6 @@ def tokenize(self, text): else: output_tokens.extend(sub_tokens) return output_tokens + + +__all__ = ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"] diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py index f4897772847029..4a89e6053b988f 100644 --- a/src/transformers/models/bert/tokenization_bert_fast.py +++ b/src/transformers/models/bert/tokenization_bert_fast.py @@ -170,3 +170,6 @@ def create_token_type_ids_from_sequences( def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) + + +__all__ = ["BertTokenizerFast"] diff --git a/src/transformers/models/bert/tokenization_bert_tf.py b/src/transformers/models/bert/tokenization_bert_tf.py index ebf88eeac9bbe8..b1f49722fbdffa 100644 --- a/src/transformers/models/bert/tokenization_bert_tf.py +++ b/src/transformers/models/bert/tokenization_bert_tf.py @@ -252,3 +252,6 @@ def get_config(self): "sep_token_id": self.sep_token_id, "pad_token_id": self.pad_token_id, } + + +__all__ = ["TFBertTokenizer"] diff --git a/src/transformers/models/bert_generation/__init__.py b/src/transformers/models/bert_generation/__init__.py index 14cf8bb5879320..3f83b1f6e5bba3 100644 --- a/src/transformers/models/bert_generation/__init__.py +++ b/src/transformers/models/bert_generation/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,61 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available - - -_import_structure = {"configuration_bert_generation": ["BertGenerationConfig"]} - -try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bert_generation"] = ["BertGenerationTokenizer"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bert_generation"] = [ - "BertGenerationDecoder", - "BertGenerationEncoder", - "BertGenerationPreTrainedModel", - "load_tf_weights_in_bert_generation", - ] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .configuration_bert_generation import BertGenerationConfig - - try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bert_generation import BertGenerationTokenizer - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bert_generation import ( - BertGenerationDecoder, - BertGenerationEncoder, - BertGenerationPreTrainedModel, - load_tf_weights_in_bert_generation, - ) - + from .configuration_bert_generation import * + from .modeling_bert_generation import * + from .tokenization_bert_generation import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py index d1d1b51b6538e2..1abe7c1a1c44ab 100644 --- a/src/transformers/models/bert_generation/configuration_bert_generation.py +++ b/src/transformers/models/bert_generation/configuration_bert_generation.py @@ -122,3 +122,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type self.use_cache = use_cache + + +__all__ = ["BertGenerationConfig"] diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 800ea2bef1d631..aaf326aa2de8eb 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -996,3 +996,11 @@ def _reorder_cache(self, past_key_values, beam_idx): tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), ) return reordered_past + + +__all__ = [ + "BertGenerationDecoder", + "BertGenerationEncoder", + "BertGenerationPreTrainedModel", + "load_tf_weights_in_bert_generation", +] diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index b1adb9b62b2551..31f046863c289c 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -170,3 +170,6 @@ def save_vocabulary(self, save_directory: str, 
filename_prefix: Optional[str] = fi.write(content_spiece_model) return (out_vocab_file,) + + +__all__ = ["BertGenerationTokenizer"] diff --git a/src/transformers/models/bert_japanese/__init__.py b/src/transformers/models/bert_japanese/__init__.py index a569c3cc54bff8..f5296087db1d00 100644 --- a/src/transformers/models/bert_japanese/__init__.py +++ b/src/transformers/models/bert_japanese/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,19 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import TYPE_CHECKING from ...utils import _LazyModule - - -_import_structure = {"tokenization_bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"]} +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer - + from .tokenization_bert_japanese import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 732e9e7aff5741..8a841a3091623d 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -977,3 +977,6 @@ def tokenize(self, text): new_pieces.append(piece) return new_pieces + + +__all__ = ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"] diff --git a/src/transformers/models/bertweet/__init__.py b/src/transformers/models/bertweet/__init__.py index 42e4a23337c20c..432622f1595d1a 100644 --- a/src/transformers/models/bertweet/__init__.py +++ b/src/transformers/models/bertweet/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,19 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from typing import TYPE_CHECKING from ...utils import _LazyModule - - -_import_structure = {"tokenization_bertweet": ["BertweetTokenizer"]} +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .tokenization_bertweet import BertweetTokenizer - + from .tokenization_bertweet import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index f478dd0832b6e4..499238e5955fe0 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -764,3 +764,6 @@ def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=Fa ############################################################################### + + +__all__ = ["BertweetTokenizer"] From 4bc39de5c34d4ffed42af4ddbf7bce0ed8a8bba6 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Tue, 10 Dec 2024 00:31:32 +0800 Subject: [PATCH 009/110] Super tiny fix logging message (#35132) Update integration_utils.py --- src/transformers/integrations/integration_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 0cc2685a55206f..e1f3bccb842fd1 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -1235,7 +1235,7 @@ def setup(self, args, state, model): logger.debug( f"MLflow experiment_name={self._experiment_name}, run_name={args.run_name}, nested={self._nested_run}," - f" tags={self._nested_run}, tracking_uri={self._tracking_uri}" + f" tracking_uri={self._tracking_uri}" ) if state.is_world_process_zero: if not self._ml_flow.is_tracking_uri_set(): From fa8763ce172fff0ae38280bf8cd7428c4921e0ed Mon Sep 17 00:00:00 2001 From: UV Date: Mon, 9 Dec 2024 22:10:32 +0530 Subject: [PATCH 010/110] Fixed typo of 'avilable' in prompts.py (#35145) --- src/transformers/agents/prompts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/agents/prompts.py b/src/transformers/agents/prompts.py index 898a7e011a2b05..0cf8beb144f8ba 100644 --- a/src/transformers/agents/prompts.py +++ b/src/transformers/agents/prompts.py @@ -395,7 +395,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): SYSTEM_PROMPT_PLAN = """You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools. Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts. -This plan should involve individual tasks based on the avilable tools, that if executed correctly will yield the correct answer. +This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS. 
After writing the final step of the plan, write the '\n' tag and stop there.""" @@ -466,7 +466,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): ``` Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts. -This plan should involve individual tasks based on the avilable tools, that if executed correctly will yield the correct answer. +This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. Beware that you have {remaining_steps} steps remaining. Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS. After writing the final step of the plan, write the '\n' tag and stop there. @@ -474,7 +474,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): Now write your new plan below.""" SYSTEM_PROMPT_PLAN_STRUCTURED = """Output a step-by-step plan to solve the task using the given tools. -This plan should involve individual tasks based on the avilable tools, that if executed correctly will yield the correct answer. Each step should be structured as follows: +This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. Each step should be structured as follows: Step #n: { "description": "tool": , @@ -620,7 +620,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): After writing the final step of the plan, write the '\n' tag and stop there. Output the plan only and nothing else.""" SYSTEM_PROMPT_PLAN_UPDATE_STRUCTURED = """Output a step-by-step plan to solve the task using the given tools. -This plan should involve individual tasks based on the avilable tools, that if executed correctly will yield the correct answer. Each step should be structured as follows: +This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. 
Each step should be structured as follows: Step #n: {{ "description": "tool": , From 34f4080ff59b1668d919a1ba9f8bc4a3a2a3f478 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:55:16 -0500 Subject: [PATCH 011/110] [CI] Fix bnb quantization tests with accelerate>=1.2.0 (#35172) --- tests/quantization/bnb/test_4bit.py | 8 ++++---- tests/quantization/bnb/test_mixed_int8.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 3eae429abb206a..9512d0aa70af97 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -385,14 +385,14 @@ def test_inference_without_keep_in_fp32(self): # test with `google-t5/t5-small` model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto") - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_4bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) T5ForConditionalGeneration._keep_in_fp32_modules = modules @@ -410,14 +410,14 @@ def test_inference_with_keep_in_fp32(self): # there was a bug with decoders - this test checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear4bit)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_4bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 567aa956271b70..158fdfaf71dc5c 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -514,14 +514,14 @@ def test_inference_without_keep_in_fp32(self): # test with `google-t5/t5-small` model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto") - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) T5ForConditionalGeneration._keep_in_fp32_modules = modules @@ -540,14 +540,14 @@ def test_inference_with_keep_in_fp32(self): # there was a bug with decoders - this test 
checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear8bitLt)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) def test_inference_with_keep_in_fp32_serialized(self): @@ -571,14 +571,14 @@ def test_inference_with_keep_in_fp32_serialized(self): # there was a bug with decoders - this test checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear8bitLt)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) From dada0fd85f13c68dd7506aba0740083523bcf2df Mon Sep 17 00:00:00 2001 From: Spiros Dontas Date: Tue, 10 Dec 2024 09:40:40 +0200 Subject: [PATCH 012/110] Fix `num_items_in_batch` not being an integer (#35115) In the method `Trainer#get_batch_samples`, the return values should be a list of batch samples and an integer indicating the number of items in the batch. However, this was not actually the case: what was returned instead of an integer was a tensor with one element. In a multi-GPU setup, this tensor is placed on a different device than the loss tensor, causing the loss function to raise a `RuntimeError`. The problem arises from https://github.com/huggingface/transformers/blob/5d7739f15a6e50de416977fe2cc9cb516d67edda/src/transformers/trainer.py#L5139-L5144, where the outer `sum` operates over a list of tensors, which means that the final result is also a tensor. To counter this issue, a new check (after the accelerator gathering) has been added in order to convert a potential tensor to an integer before returning `num_items_in_batch`.
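For illustration, a minimal sketch of the failure mode and of the normalization this patch applies (a sketch only: it assumes the per-batch counts are 0-d torch tensors, as they are when produced by summing a label mask):

    import torch

    # Summing a list of 0-d tensors yields another 0-d tensor, not a Python int,
    # because sum() starts from the int 0 and then accumulates tensor operands.
    per_batch_counts = [torch.tensor(128), torch.tensor(117)]
    num_items_in_batch = sum(per_batch_counts)  # tensor(245); may live on a GPU after gathering

    # Converting to a plain Python integer removes the device dependency,
    # so the later loss computation cannot hit a cross-device RuntimeError.
    if torch.is_tensor(num_items_in_batch):
        num_items_in_batch = num_items_in_batch.item()  # 245 as an int

This mirrors the check added below in `get_batch_samples`.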
--- src/transformers/trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index f7d79481809807..be41a415e5a710 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -5138,4 +5138,8 @@ def get_batch_samples(self, epoch_iterator, num_batches): if self.args.average_tokens_across_devices: num_items_in_batch = self.accelerator.gather(num_items_in_batch).sum().item() + + if torch.is_tensor(num_items_in_batch): + num_items_in_batch = num_items_in_batch.item() + return batch_samples, num_items_in_batch From 0938b57770db818b3d818bc00fa989fa206087dd Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 10 Dec 2024 09:59:17 +0100 Subject: [PATCH 013/110] Assisted decoding multi-gpu (#35116) * fix * move a few lines up --- src/transformers/generation/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 89c57cb913fec2..fe634141eca09b 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4260,9 +4260,10 @@ def _assisted_decoding( while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): cur_len = input_ids.shape[-1] - # 1. Fetch candidate sequences from a `CandidateGenerator` + # 1. Fetch candidate sequences from a `CandidateGenerator` and move to the correct device candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids) + candidate_input_ids = candidate_input_ids.to(self.device) if candidate_logits is not None: candidate_logits = candidate_logits.to(self.device) From 80f2b1610fa17ebf582897c8611180cac38652f0 Mon Sep 17 00:00:00 2001 From: Trevor Royer Date: Tue, 10 Dec 2024 02:11:45 -0700 Subject: [PATCH 014/110] Fix file path for shard_num 1 with mllama converter (#35053) "#35049 fix path for num_shard 1" --- .../models/mllama/convert_mllama_weights_to_hf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/mllama/convert_mllama_weights_to_hf.py b/src/transformers/models/mllama/convert_mllama_weights_to_hf.py index ca22d31ee3ca5e..b2c40e27bb2b40 100644 --- a/src/transformers/models/mllama/convert_mllama_weights_to_hf.py +++ b/src/transformers/models/mllama/convert_mllama_weights_to_hf.py @@ -338,7 +338,11 @@ def write_model( print(f"Fetching all parameters from the checkpoint at {input_base_path}...") if num_shards == 1: - loaded = [torch.load(os.path.join(input_base_path, "consolidated.pth"), map_location="cpu", mmap=True)] + if os.path.exists(os.path.join(input_base_path, "consolidated.00.pth")): + path = os.path.join(input_base_path, "consolidated.00.pth") + else: + path = os.path.join(input_base_path, "consolidated.pth") + loaded = [torch.load(path, map_location="cpu", mmap=True)] else: loaded = [ torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu", mmap=True) From 6acb4e43a7a0c89830bc3658cb425477e6934be5 Mon Sep 17 00:00:00 2001 From: Gallil Maimon <62688880+gallilmaimon@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:18:23 +0200 Subject: [PATCH 015/110] Support BatchNorm in Hubert pos_conv_emb as in fairseq (#34389) * Support BatchNorm in Hubert pos_conv_emb as in fairseq * Correct the new defaults (#34377) * Correct the new defaults * CIs * add check * Update utils.py * Update utils.py * Add the max_length in generate test checking shape without passing length * style * CIs * fix fx CI issue * 
[auto. ping] Avoid sending empty info + add more team members (#34383) * update * update --------- Co-authored-by: ydshieh * Fix glm (#34388) * Fix duplicated * fix import * Use non nested images and batched text Idefics2/3 (#34222) * add support for non nested images and add tests * add tests error scenario * fix style * added single and no image to error tests * Fix onnx non-expotable inplace aten op (#34376) * fix onnx non-expotable inplace op * mistral, qwen2, qwen2_vl, starcoder2 * fixup copies * Fix right padding in LLaVA models (#34305) * fix right pad llavas * device mismatch * no filter (#34391) * no filter * no filter * no filter --------- Co-authored-by: ydshieh * SynthID: better example (#34372) * better example * Update src/transformers/generation/configuration_utils.py * Update src/transformers/generation/logits_process.py * nits * Tests: upgrade `test_eager_matches_sdpa_generate` (#34386) * Fix bnb training test failure (#34414) * Fix bnb training test: compatibility with OPTSdpaAttention * Avoid check expected exception when it is on CUDA (#34408) * update * update --------- Co-authored-by: ydshieh * Fix typos in agents_advanced.md (#34405) * [docs] Cache implementations (#34325) cache * [run-slow] hubert * Support BatchNorm in Hubert pos_conv_emb as in fairseq Add conversion integration test, and make batchnorm explicit variable * Support BatchNorm in Hubert pos_conv_emb as in fairseq fix make fixup styling changes * [run-slow] hubert * Support BatchNorm in Hubert pos_conv_emb as in fairseq * [run-slow] hubert * Support BatchNorm in Hubert pos_conv_emb as in fairseq Add conversion integration test, and make batchnorm explicit variable * Support BatchNorm in Hubert pos_conv_emb as in fairseq fix make fixup styling changes * [run-slow] hubert * [run-slow] hubert --------- Co-authored-by: Cyril Vallez Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Co-authored-by: ydshieh Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Co-authored-by: Raushan Turganbay Co-authored-by: Joao Gante Co-authored-by: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Co-authored-by: Rudy Delouya Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com> --- .../models/hubert/configuration_hubert.py | 4 ++ ..._original_pytorch_checkpoint_to_pytorch.py | 15 ++++++- .../models/hubert/modeling_hubert.py | 40 ++++++++++--------- tests/models/hubert/test_modeling_hubert.py | 37 +++++++++++++++++ 4 files changed, 77 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index 20977cff87d167..9f488b19888957 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -94,6 +94,8 @@ class HubertConfig(PretrainedConfig): embeddings layer. num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): Number of groups of 1D convolutional positional embeddings layer. + conv_pos_batch_norm (`bool`, *optional*, defaults to `False`): + Whether to use batch norm instead of weight norm in conv_pos do_stable_layer_norm (`bool`, *optional*, defaults to `False`): Whether do apply *stable* layer norm architecture of the Transformer encoder. 
`do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is @@ -182,6 +184,7 @@ def __init__( conv_bias=False, num_conv_pos_embeddings=128, num_conv_pos_embedding_groups=16, + conv_pos_batch_norm=False, do_stable_layer_norm=False, apply_spec_augment=True, mask_time_prob=0.05, @@ -209,6 +212,7 @@ def __init__( self.conv_bias = conv_bias self.num_conv_pos_embeddings = num_conv_pos_embeddings self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.conv_pos_batch_norm = conv_pos_batch_norm self.num_feat_extract_layers = len(self.conv_dim) self.num_hidden_layers = num_hidden_layers self.intermediate_size = intermediate_size diff --git a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py index 6478fdadf13de3..4966340493f35c 100644 --- a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py @@ -38,7 +38,8 @@ MAPPING = { "post_extract_proj": "feature_projection.projection", - "encoder.pos_conv.0": "encoder.pos_conv_embed.conv", + "encoder.pos_conv.0": "encoder.pos_conv_embed.batch_norm", + "encoder.pos_conv.1": "encoder.pos_conv_embed.conv", "self_attn.k_proj": "encoder.layers.*.attention.k_proj", "self_attn.v_proj": "encoder.layers.*.attention.v_proj", "self_attn.q_proj": "encoder.layers.*.attention.q_proj", @@ -76,6 +77,12 @@ def set_recursively(hf_pointer, key, value, full_name, weight_type): hf_pointer.weight_v.data = value elif weight_type == "bias": hf_pointer.bias.data = value + elif weight_type == "running_mean": + hf_pointer.running_mean.data = value + elif weight_type == "running_var": + hf_pointer.running_var.data = value + elif weight_type == "num_batches_tracked": + hf_pointer.num_batches_tracked.data = value else: hf_pointer.data = value @@ -116,6 +123,12 @@ def recursively_load_weights(fairseq_model, hf_model, is_finetuned): weight_type = "weight" elif "bias" in name: weight_type = "bias" + elif "running_mean" in name: + weight_type = "running_mean" + elif "running_var" in name: + weight_type = "running_var" + elif "num_batches_tracked" in name: + weight_type = "num_batches_tracked" else: weight_type = None set_recursively(hf_model, mapped_key, value, name, weight_type) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 57f59cf9aab94f..03904a6abfa08b 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -260,7 +260,6 @@ def forward(self, hidden_states): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->Hubert class HubertPositionalConvEmbedding(nn.Module): def __init__(self, config): super().__init__() @@ -272,32 +271,37 @@ def __init__(self, config): groups=config.num_conv_pos_embedding_groups, ) - weight_norm = nn.utils.weight_norm - if hasattr(nn.utils.parametrizations, "weight_norm"): - weight_norm = nn.utils.parametrizations.weight_norm + self.batch_norm = None + if config.conv_pos_batch_norm: + self.batch_norm = nn.BatchNorm1d(config.hidden_size) + else: + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm - if 
is_deepspeed_zero3_enabled(): - import deepspeed + if is_deepspeed_zero3_enabled(): + import deepspeed - with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): - self.conv = weight_norm(self.conv, name="weight", dim=2) - if hasattr(self.conv, "parametrizations"): - weight_g = self.conv.parametrizations.weight.original0 - weight_v = self.conv.parametrizations.weight.original1 + with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): + self.conv = weight_norm(self.conv, name="weight", dim=2) + if hasattr(self.conv, "parametrizations"): + weight_g = self.conv.parametrizations.weight.original0 + weight_v = self.conv.parametrizations.weight.original1 + else: + weight_g = self.conv.weight_g + weight_v = self.conv.weight_v + deepspeed.zero.register_external_parameter(self, weight_v) + deepspeed.zero.register_external_parameter(self, weight_g) else: - weight_g = self.conv.weight_g - weight_v = self.conv.weight_v - deepspeed.zero.register_external_parameter(self, weight_v) - deepspeed.zero.register_external_parameter(self, weight_g) - else: - self.conv = weight_norm(self.conv, name="weight", dim=2) + self.conv = weight_norm(self.conv, name="weight", dim=2) self.padding = HubertSamePadLayer(config.num_conv_pos_embeddings) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): hidden_states = hidden_states.transpose(1, 2) - + if self.batch_norm is not None: + hidden_states = self.batch_norm(hidden_states) hidden_states = self.conv(hidden_states) hidden_states = self.padding(hidden_states) hidden_states = self.activation(hidden_states) diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index 86f2b4119324ae..191d2f8c88c380 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -943,3 +943,40 @@ def test_inference_distilhubert(self): self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=5e-3)) self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=5e-3)) self.assertTrue(abs(outputs.sum() - expected_output_sum) < 0.1) + + def test_inference_hubert_25hz(self): + model = HubertModel.from_pretrained("slprl/mhubert-base-25hz").to(torch_device) + + sample = self._load_datasamples(1) + input_speech = torch.tensor(sample[0], dtype=torch.float, device=torch_device).unsqueeze(0) + + with torch.no_grad(): + outputs = model(input_speech, output_hidden_states=True).hidden_states[11] + + # expected outputs taken from the original textlesslib implementation by: + # model = SpeechEncoder.by_name(dense_model_name='mhubert-base-25hz', quantizer_model_name='kmeans', + # vocab_size=500, deduplicate=False, need_f0=False) + # model(wav)['dense'] + expected_outputs_first = torch.tensor( + [ + [0.0267, 0.1776, -0.1706, -0.4559], + [-0.2430, -0.2943, -0.1864, -0.1187], + [-0.1812, -0.4239, -0.1916, -0.0858], + [-0.1495, -0.4758, -0.4036, 0.0302], + ], + device=torch_device, + ) + expected_outputs_last = torch.tensor( + [ + [0.3366, -0.2734, -0.1415, -0.3055], + [0.2329, -0.3580, -0.1421, -0.3197], + [0.1631, -0.4301, -0.1965, -0.2956], + [0.3342, -0.2185, -0.2253, -0.2363], + ], + device=torch_device, + ) + expected_output_sum = 1681.7603 + + self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=5e-3)) + self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=5e-3)) + self.assertTrue(abs(outputs.sum() - expected_output_sum) < 0.1) From 
5fba3f99c0a44f7613aeaa0550f7786919780663 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Tue, 10 Dec 2024 14:52:20 +0100 Subject: [PATCH 016/110] Remove unnecessary masked_fill in deberta models (#35182) --- src/transformers/models/deberta/modeling_deberta.py | 1 - src/transformers/models/deberta_v2/modeling_deberta_v2.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 6993121b6c1ebe..c9a85bcad1bd6f 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -290,7 +290,6 @@ def forward( attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min) # bsz x height x length x dimension attention_probs = nn.functional.softmax(attention_scores, dim=-1) - attention_probs.masked_fill(attention_mask, 0) attention_probs = self.dropout(attention_probs) if self.head_weights_proj is not None: diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 6645c1de832e12..7d2f25603a6f96 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -267,7 +267,6 @@ def forward( attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min) # bsz x height x length x dimension attention_probs = nn.functional.softmax(attention_scores, dim=-1) - attention_probs.masked_fill(attention_mask, 0) attention_probs = self.dropout(attention_probs) context_layer = torch.bmm( From 3e2769a3c998599a9b84b80d9ef8c81b29476276 Mon Sep 17 00:00:00 2001 From: "Huang, Guangtai" Date: Tue, 10 Dec 2024 06:31:22 -0800 Subject: [PATCH 017/110] Fix DBRX LayerNorm init method (#35177) fix dbrx layernorm init --- src/transformers/models/dbrx/modeling_dbrx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 659fa154ecf776..7d20b766658f23 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -845,7 +845,7 @@ def _init_weights(self, module: nn.Module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): - module.weight.data.normal_(mean=0.0, std=std) + module.weight.data.fill_(1.0) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, DbrxExpertGLU): From e5c45a667978ecdcfc9d58ae1dae4ad67401ab39 Mon Sep 17 00:00:00 2001 From: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:30:09 +0100 Subject: [PATCH 018/110] Fixing GGUF support for StableLm (#35060) fix Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/modeling_gguf_pytorch_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index 7562649be753bb..00c080fbea81c7 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -307,7 +307,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): ffn_norm_name = "ffn_norm" qkv_bias = any(bias_name in tensor.name for tensor in reader.tensors for bias_name in attn_bias_name) 
use_parallel_residual = any(ffn_norm_name in tensor.name for tensor in reader.tensors) - parsed_parameters["config"]["qkv_bias"] = qkv_bias + parsed_parameters["config"]["use_qkv_bias"] = qkv_bias parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual model_size = "" From 425af6cdc20d20e93fb814e6932c1b194b3f2915 Mon Sep 17 00:00:00 2001 From: Ahmed Almaghz <53489256+AhmedAlmaghz@users.noreply.github.com> Date: Tue, 10 Dec 2024 20:08:27 +0300 Subject: [PATCH 019/110] [i18n-ar] Translated file : `docs/source/ar/community.md` into Arabic (#33027) * Add docs/source/ar/community.md to Add_docs_source_ar_community.md * Update community.md * Update community.md * Update community.md * Update _toctree.yml - add community.md * Update docs/source/ar/community.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Create how_to_hack_models.md * Create modular_transformers.md * Create tiktoken.md * Update _toctree.yml * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/tiktoken.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/tiktoken.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> --------- Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> --- docs/source/ar/_toctree.yml | 10 +- docs/source/ar/community.md | 66 +++++++++ docs/source/ar/how_to_hack_models.md | 163 ++++++++++++++++++++++ 
docs/source/ar/modular_transformers.md | 184 +++++++++++++++++++++++++ docs/source/ar/tiktoken.md | 41 ++++++ 5 files changed, 462 insertions(+), 2 deletions(-) create mode 100644 docs/source/ar/community.md create mode 100644 docs/source/ar/how_to_hack_models.md create mode 100644 docs/source/ar/modular_transformers.md create mode 100644 docs/source/ar/tiktoken.md diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index 1208153c22df68..138d3a1bd8aa08 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -133,12 +133,18 @@ title: المعايير - local: notebooks title: دفاتر الملاحظات مع الأمثلة -# - local: community -# title: موارد المجتمع + - local: community + title: موارد المجتمع - local: troubleshooting title: استكشاف الأخطاء وإصلاحها - local: gguf title: التوافق مع ملفات GGUF + - local: tiktoken + title: التوافق مع ملفات TikToken + - local: modular_transformers + title: الوحدات النمطية في `transformers` + - local: how_to_hack_models + title: اختراق النموذج (الكتابة فوق فئة لاستخدامك) title: أدلة المطورين # - sections: # - local: quantization/overview diff --git a/docs/source/ar/community.md b/docs/source/ar/community.md new file mode 100644 index 00000000000000..5a1c31de0aaa3f --- /dev/null +++ b/docs/source/ar/community.md @@ -0,0 +1,66 @@ +# مجتمع المطورين + +هذه الصفحة تجمع الموارد حول 🤗 Transformers التي طورها المجتمع. + +## موارد المجتمع: + +| المصدر | الوصف | المؤلف | +|:----------|:-------------|------:| +| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | مجموعة من البطاقات التعليمية القائمة على [Transformers Docs Glossary](glossary) والتي تم وضعها في شكل يمكن تعلمه/مراجعته بسهولة باستخدام [Anki](https://apps.ankiweb.net/) وهو تطبيق مفتوح المصدر متعدد المنصات مصمم خصيصًا للاحتفاظ بالمعرفة على المدى الطويل. شاهد هذا [فيديو تمهيدي حول كيفية استخدام البطاقات التعليمية](https://www.youtube.com/watch?v=Dji_7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | + +## دفاتر ملاحظات المجتمع: + +| الدفتر | الوصف | المؤلف | | +|:----------|:-------------|:-------------|------:| +| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | كيفية توليد كلمات الأغاني على غرار فنانك المفضل من خلال ضبط نموذج GPT-2 | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Train T5 in Tensorflow 2](https://github.com/snapthat/TF-T5-text-to-text) | كيفية تدريب T5 لأي مهمة باستخدام Tensorflow 2. 
يوضح هذا الدفتر مهمة السؤال والجواب المنفذة في Tensorflow 2 باستخدام SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | كيفية تدريب T5 على SQUAD مع Transformers و Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | كيفية ضبط نموذج T5 للتصنيف والمهام متعددة الخيارات باستخدام تنسيق النص إلى نص مع PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | كيفية ضبط نموذج DialoGPT على مجموعة بيانات جديدة لروبوتات الدردشة المحادثية المفتوحة | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | كيفية التدريب على تسلسلات طويلة تصل إلى 500,000 رمز باستخدام Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | كيفية ضبط نموذج BART للتلخيص باستخدام fastai باستخدام blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | +| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | كيفية توليد تغريدات على غرار حساب Twitter المفضل لديك من خلال ضبط نموذج GPT-2 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | دليل كامل لعرض تكامل W&B مع Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | كيفية بناء نسخة "طويلة" من النماذج المسبقة التدريب الموجودة | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | كيفية ضبط نموذج Longformer لمهمة QA | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | كيفية تقييم نموذج Longformer على TriviaQA مع `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | كيفية ضبط نموذج T5 لاستخراج المشاعر باستخدام تنسيق النص إلى نص مع PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | كيفية ضبط نموذج DistilBert للتصنيف متعدد الفئات باستخدام PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|كيفية ضبط نموذج BERT للتصنيف متعدد التصنيفات باستخدام PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|كيفية ضبط نموذج T5 للتلخيص في PyTorch وتتبع التجارب باستخدام WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +|[Speed up Fine-Tuning in Transformers with Dynamic Padding / 
Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|كيفية تسريع الضبط الدقيق بعامل 2 باستخدام الضبط الديناميكي/التقسيم|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| كيفية تدريب نموذج Reformer مع طبقات الانتباه ثنائية الاتجاه | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| كيفية زيادة مفردات نموذج SciBERT المسبق التدريب من AllenAI على مجموعة بيانات CORD وإنشاء خط أنابيب لها. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| كيفية ضبط نموذج BlenderBotSmall للتلخيص على مجموعة بيانات مخصصة، باستخدام واجهة برمجة التطبيقات Trainer. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | كيفية ضبط نموذج Electra للتحليل العاطفي وتفسير التنبؤات باستخدام Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | كيفية ضبط نموذج GPT-2 غير الإنجليزي باستخدام فئة Trainer | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | كيفية ضبط نموذج DistilBERT لمهمة التصنيف متعدد التصنيفات | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | كيفية ضبط نموذج ALBERT أو أي نموذج آخر قائم على BERT لمهمة 
التصنيف المزدوج للجمل | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | كيفية ضبط نموذج Roberta للتحليل العاطفي | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | ما مدى دقة الإجابات على الأسئلة التي يولدها نموذجك التحويلي seq2seq؟ | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | كيفية ضبط نموذج DistilBERT للتصنيف النصي في TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | كيفية البدء السريع لنموذج *EncoderDecoderModel* مع نقطة تفتيش *google-bert/bert-base-uncased* للتلخيص على CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | كيفية البدء السريع لنموذج *EncoderDecoderModel* المشترك مع نقطة تفتيش *FacebookAI/roberta-base* للتلخيص على BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | كيفية ضبط نموذج *TapasForQuestionAnswering* مع نقطة تفتيش *tapas-base* على مجموعة بيانات Sequential Question Answering (SQA) | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | كيفية تقييم نموذج *TapasForSequenceClassification* المضبوط مسبقًا مع نقطة تفتيش *tapas-base-finetuned-tabfact* باستخدام مزيج من مكتبتي 🤗 datasets و 🤗 transformers | [Niels Rogge](https://github.com/nielsrogge) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | كيفية ضبط نموذج mBART باستخدام Seq2SeqTrainer للترجمة من الهندية إلى الإنجليزية | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Fine-tune LayoutLM on FUNSD (a form understanding dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | كيفية ضبط نموذج *LayoutLMForTokenClassification* على مجموعة بيانات FUNSD لاستخراج المعلومات من المستندات الممسوحة ضوئيًا | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | كيفية ضبط نموذج DistilGPT2 وتوليد النص | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | كيفية ضبط نموذج LED على pubmed للتلخيص طويل المدى | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | كيفية تقييم نموذج LED للتلخيص طويل المدى بشكل فعال | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | كيفية ضبط نموذج *LayoutLMForSequenceClassification* على مجموعة بيانات RVL-CDIP لتصنيف المستندات الممسوحة ضوئيًا | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | كيفية فك تشفير تسلسل CTC مع تعديل نموذج اللغة | [Eric Lam](https://github.com/voidful) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_zQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | كيفية ضبط نموذج BART للتلخيص بلغتين باستخدام فئة Trainer | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | كيفية تقييم نموذج BigBird للأسئلة والأجوبة على وثائق طويلة على Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | كيفية إنشاء تعليقات توضيحية على YouTube من أي فيديو من خلال تفريغ الصوت باستخدام Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | كيفية ضبط نموذج Vision Transformer (ViT) على CIFAR-10 باستخدام مكتبات HuggingFace Transformers و Datasets و PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | كيفية ضبط نموذج Vision Transformer (ViT) على CIFAR-10 باستخدام مكتبات HuggingFace Transformers و Datasets و 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | كيفية تقييم نموذج *LukeForEntityClassification* على مجموعة بيانات Open Entity | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | كيفية تقييم نموذج 
*LukeForEntityPairClassification* على مجموعة بيانات TACRED | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | كيفية تقييم نموذج *LukeForEntitySpanClassification* على مجموعة بيانات CoNLL-2003 | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | كيفية تقييم نموذج *BigBirdPegasusForConditionalGeneration* على مجموعة بيانات PubMed | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | كيفية استخدام نموذج Wav2Vec2 المسبق التدريب لتصنيف المشاعر على مجموعة بيانات MEGA | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | كيفية استخدام نموذج *DetrForObjectDetection* المدرب للكشف عن الأجسام في صورة وتصوير الانتباه | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | كيفية ضبط نموذج *DetrForObjectDetection* على مجموعة بيانات الكشف عن الأجسام المخصصة | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | كيفية ضبط نموذج *T5* على مهمة التعرف على الكيانات المسماة | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | +| [Fine-Tuning Open-Source LLM using QLoRA with MLflow and PEFT](https://github.com/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | كيفية استخدام [QLoRA](https://github.com/artidoro/qlora) و 
[PEFT](https://huggingface.co/docs/peft/en/index) لضبط نموذج LLM بطريقة فعالة من حيث الذاكرة، مع استخدام [MLflow](https://mlflow.org/docs/latest/llms/transformers/index.html) لإدارة تتبع التجارب | [Yuki Watanabe](https://github.com/B-Step62) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) |
diff --git a/docs/source/ar/how_to_hack_models.md b/docs/source/ar/how_to_hack_models.md
new file mode 100644
index 00000000000000..8ce3589732f06a
--- /dev/null
+++ b/docs/source/ar/how_to_hack_models.md
@@ -0,0 +1,163 @@
+# كيفية تعديل أي نموذج من نماذج Transformers
+
+توفر مكتبة [🤗 Transformers](https://github.com/huggingface/transformers) مجموعة من النماذج المسبقة التدريب والأدوات لمعالجة اللغات الطبيعية، والرؤية، وما إلى ذلك. على الرغم من أن هذه النماذج تغطي مجموعة واسعة من التطبيقات، فقد تواجه حالات استخدام لا تدعمها المكتبة بشكل افتراضي. يُمكن للتخصيص أن يفتح إمكانيات جديدة، مثل إضافة طبقات جديدة، أو تعديل البنية المعمارية، أو تحسين آليات الانتباه. سيُوضح لك هذا الدليل كيفية تعديل نماذج Transformers الموجودة لتلبية احتياجاتك المحددة. الشيء الرائع هو أنك لست بحاجة إلى الخروج من إطار عمل Transformers لإجراء هذه التغييرات. يمكنك تعديل النماذج مباشرةً في Transformers والاستفادة من الميزات مثل [واجهة برمجة التطبيقات Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer)، و [PreTrainedModel](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel)، والضبط الدقيق الفعال باستخدام أدوات مثل [PEFT](https://huggingface.co/docs/peft/index).
+
+سنرشدك في هذا الدليل لكيفية تخصيص نماذج Transformers الموجودة لتلبية متطلباتك، دون فقدان مزايا الإطار. ستتعلم كيفية:
+
+- تعديل بنية نموذج ما من خلال تغيير آلية الانتباه الخاصة به.
+- تطبيق تقنيات مثل Low-Rank Adaptation (LoRA) على مكونات نموذج محددة.
+
+نحن نشجعك على المساهمة باختراقاتك الخاصة ومشاركتها هنا مع المجتمع!
+
+## مثال: تعديل آلية الانتباه في نموذج Segment Anything (SAM)
+
+نموذج **Segment Anything (SAM)** هو نموذج رائد في مجال تجزئة الصور. في تنفيذه الافتراضي، يستخدم SAM إسقاطًا مجمعًا للاستعلام والمفتاح والقيمة (`qkv`) في آلية الانتباه الخاصة به. ومع ذلك، قد ترغب في ضبط مكونات محددة فقط من آلية الانتباه، مثل إسقاطات الاستعلام (`q`) والقيمة (`v`)، لتقليل عدد المعلمات القابلة للتدريب والموارد الحسابية المطلوبة.
+
+### الدافع
+
+من خلال تقسيم الإسقاط المجمع `qkv` إلى إسقاطات منفصلة `q` و `k` و `v`، يمكنك تطبيق تقنيات مثل **LoRA** (Low-Rank Adaptation) على إسقاطي `q` و `v` فقط. يسمح لك هذا بما يلي:
+
+- ضبط عدد أقل من المعلمات، مما يقلل من العبء الحسابي.
+- تحقيق أداء أفضل من خلال التركيز على مكونات محددة.
+- تجربة استراتيجيات تعديل مختلفة في آلية الانتباه.
+
+### التنفيذ
+
+#### **الخطوة 1: إنشاء فئة انتباه مخصصة**
+
+قم أولاً بإنشاء فئة فرعية من فئة `SamVisionAttention` الأصلية وعدلها لتضم إسقاطات `q` و `k` و `v` منفصلة.
+ +```python +import torch +import torch.nn as nn +from transformers.models.sam.modeling_sam import SamVisionAttention + +class SamVisionAttentionSplit(SamVisionAttention, nn.Module): + def __init__(self, config, window_size): + super().__init__(config, window_size) + del self.qkv + # إسقاطات منفصلة q و k و v + self.q = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self.k = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self.v = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self._register_load_state_dict_pre_hook(self.split_q_k_v_load_hook) + + def split_q_k_v_load_hook(self, state_dict, prefix, *args): + keys_to_delete = [] + for key in list(state_dict.keys()): + if "qkv." in key: + # تقسيم q و k و v من الإسقاط المجمع + q, k, v = state_dict[key].chunk(3, dim=0) + # استبدال الإسقاطات الفردية q و k و v + state_dict[key.replace("qkv.", "q.")] = q + state_dict[key.replace("qkv.", "k.")] = k + state_dict[key.replace("qkv.", "v.")] = v + # وضع علامة على مفتاح qkv القديم للحذف + keys_to_delete.append(key) + + # حذف مفاتيح qkv القديمة + for key in keys_to_delete: + del state_dict[key] + + def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor: + batch_size, height, width, _ = hidden_states.shape + qkv_shapes = (batch_size * self.num_attention_heads, height * width, -1) + query = self.q(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + key = self.k(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + value = self.v(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + + attn_weights = (query * self.scale) @ key.transpose(-2, -1) + + if self.use_rel_pos: + attn_weights = self.add_decomposed_rel_pos( + attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width) + ) + + attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1) + attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1) + attn_output = self.proj(attn_output) + + if output_attentions: + outputs = (attn_output, attn_weights) + else: + outputs = (attn_output, None) + return outputs +``` + +**الشرح:** + +- **الإسقاطات المنفصلة:** يتم إزالة الإسقاط المُجمع `qkv`، وإنشاء إسقاطات خطية منفصلة `q` و `k` و `v`. +- **دالة استدعاء تحميل الأوزان:** تقوم طريقة `_split_qkv_load_hook` بتقسيم أوزان `qkv` المسبقة التدريب إلى أوزان `q` و `k` و `v` منفصلة عند تحميل النموذج. يضمن هذا التوافق مع أي نموذج مسبق التدريب. +- **التنفيذ الأمامي:** يتم حساب الاستعلامات والمفاتيح والقيم بشكل منفصل، وتستمر آلية الانتباه كالمعتاد. + +#### **الخطوة 2: استبدال فئة الانتباه الأصلية** + +استبدل فئة `SamVisionAttention` الأصلية بفئتك المخصصة بحيث يستخدم النموذج آلية الانتباه المعدلة. 
+ +```python +from transformers import SamModel +from transformers.models.sam import modeling_sam + +# استبدال فئة الاهتمام في وحدة نمطية modeling_sam +modeling_sam.SamVisionAttention = SamVisionAttentionSplit + +# تحميل نموذج SAM المسبق التدريب +model = SamModel.from_pretrained("facebook/sam-vit-base") +``` + +**الشرح:** + +- **استبدال الفئة:** من خلال تعيين فئتك المخصصة إلى `modeling_sam.SamVisionAttention`، فإن أي حالات من فئة `SamVisionAttention` في النموذج ستستخدم النسخة المعدلة. وبالتالي، عند استدعاء `SamModel`، سيتم استخدام `SamVisionAttentionSplit` المحددة حديثًا. +- **تحميل النموذج:** يتم تحميل النموذج باستخدام `from_pretrained`، ويتم دمج آلية الانتباه المخصصة. + +#### **الخطوة 3: تطبيق LoRA على إسقاطات محددة** + +مع وجود إسقاطات `q` و `k` و `v` منفصلة، يمكنك الآن تطبيق LoRA على مكونات محددة، مثل إسقاطات `q` و `v`. + +```python +from peft import LoraConfig, get_peft_model + +config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q", "v"], # تطبيق LoRA على إسقاطات q و v + lora_dropout=0.1, + task_type="mask-generation" +) + +# تطبيق LoRA على النموذج +model = get_peft_model(model, config) +``` + +**الشرح:** + +- **تكوين LoRA:** تحدد `LoraConfig` المرتبة `r`، وعامل القياس `lora_alpha`، والوحدات المستهدفة (`"q"` و `"v"`)، ومعدل التخلي، ونوع المهمة. +- **تطبيق LoRA:** تقوم دالة `get_peft_model` بتطبيق LoRA على الوحدات المحددة في النموذج. +- **تقليل المعلمات:** من خلال التركيز على `q` و `v`، فإنك تقلل عدد المعلمات القابلة للتدريب، مما يؤدي إلى تسريع التدريب وتقليل استخدام الذاكرة. + +#### **الخطوة 4: التحقق من عدد المعلمات القابلة للتدريب** + +من السهل التحقق من عدد المعلمات القابلة للتدريب ومعرفة تأثير تعديلك. + +```python +model.print_trainable_parameters() +``` + +**الناتج المتوقع:** + +``` +عدد المعلمات القابلة للتدريب: 608,256 || جميع المعلمات: 94,343,728 || نسبة المعلمات القابلة للتدريب: 0.6447 +عدد المعلمات القابلة للتدريب: 912,384 || جميع المعلمات: 94,647,856 || نسبة المعلمات القابلة للتدريب: 0.9640 # مع k +``` + +## المساهمة بابداعاتك الخاصة + +يمكن لتعديل النماذج المسبقة التدريب أن يفتح آفاقًا جديدة للبحث والتطبيق. من خلال فهم وتعديل الآليات الداخلية للنماذج مثل SAM، يمكنك تخصيصها لتلبية احتياجاتك المحددة، وتحسين الأداء، وتجربة أفكار جديدة. + +إذا قمت بتطوير تعديﻻتك الخاصة لنماذج Transformers وترغب في مشاركتها، ففكر في المساهمة في هذه الوثيقة. + +- **إنشاء طلب سحب (Pull Request):** شارك تغييراتك وتحسيناتك في التعليمات البرمجية مباشرة في المستودع. +- **كتابة التوثيق:** قدم تفسيرات وأمثلة واضحة لتعديلاتك. +- **التفاعل مع المجتمع:** ناقش أفكارك واحصل على تعليقات من المطورين والباحثين الآخرين من خلال فتح مشكلة. diff --git a/docs/source/ar/modular_transformers.md b/docs/source/ar/modular_transformers.md new file mode 100644 index 00000000000000..b500fec1c92d25 --- /dev/null +++ b/docs/source/ar/modular_transformers.md @@ -0,0 +1,184 @@ +# المحولات النمطية + +مكتبة `transformers` هي إطار عمل ذو فلسفة محدد؛ يتم تعريف فلسفتنا في [الدليل المفاهيمي](./philosophy). + +جوهر هذه الفلسفة يتمثل في مبدأ [نموذج واحد، ملف واحد](https://huggingface.co/blog/transformers-design-philosophy) +في المكتبة. الجانب السلبي لهذا المكون هو تقييده لوراثة واستيراد مكونات الملفات. + +نتيجة لذلك، تتكرر مكونات النموذج عبر العديد من الملفات. يحتوي `transformers` على عدد كبير من طبقات الانتباه، يقارب عدد النماذج، والكثير منها متطابق. يتسبب هذا في تباعد عمليات التنفيذ المستقلة مع تطبيق الإصلاحات والتغييرات. +على أجزاء محددة من التعليمات البرمجية. + +ولمعالجة ذلك، اعتمدنا مفهوم "النسخ" في المكتبة. 
فبإضافة تعليق يُشير إلى أن التعليمات البرمجية هي نسخة من أخرى، نضمن من خلال أنظمة CI والأوامر المحلية عدم تباعد النسخ. لكن هذه العملية، رغم بساطتها، تُسبب إرهاقاً. كما أنها تزيد العبء على المساهمين، وهو ما نهدف إلى تجاوزه. + +غالباً ما تتطلب مساهمات النماذج إضافة تعليمات برمجية (حوالي 1000 سطر)، ومعالج (حوالي 500 سطر)، واختبارات، ووثائق، إلخ. ونادراً ما تقل مساهمات النماذج عن 3000-5000 سطر من التعليمات البرمجية، معظمها أكواد نمطية. هذا يرفع مستوى المساهمات، + +ونهدف مع المحولات النمطية إلى خفض هذا المستوى إلى حدّ مقبول. + +## ما هو؟ + +تقدم المحولات النمطية مفهوم ملف "نمطي" لمجلد نموذج. يقبل هذا الملف النمطي تعليمات برمجية +غير مقبولة عادة في ملفات النمذجة/المعالجة، حيث يسمح بالاستيراد من نماذج مجاورة وكذلك +الوراثة من الفئات إلى فئات أخرى. + +يعرّف هذا الملف النمطي النماذج والمعالجات وفئة التكوين التي سيتم تعريفها في وحداتهم +المتعلقة. + +وأخيرًا، يقدم هذا الميزة أداة `linter` جديدة والتي ستعمل على "تفكيك" الملف النمطي إلى بنية "نموذج واحد، ملف واحد" +هيكل الدليل. سيتم إنشاء هذه الملفات تلقائيًا في كل مرة يتم فيها تشغيل البرنامج النصي؛ مما يقلل من المساهمات المطلوبة +إلى الملف النمطي، وبالتالي فقط إلى التغييرات بين النموذج المساهم والنماذج الأخرى. + +سيقوم مستخدمو النموذج في النهاية باستيراد واستخدام واجهة الملف الواحد، لذا لا يتوقع حدوث أي تغيير هنا. من خلال القيام بذلك، +نأمل في الجمع بين أفضل ما في العالمين: تمكين المساهمات البسيطة مع الالتزام بفلسفتنا. + +لذلك، هذا بديل لعلامات `# Copied from`، ويمكن توقع انتقال النماذج المساهمة سابقًا إلى +تنسيق المحولات النمطية الجديد في الأشهر المقبلة. + +### التفاصيل + +تُبسط أداة "linter" الوراثة، مُنشئةً جميع الملفات المفردة من الملف النمطي، مع الحفاظ على شفافيتها أمام مستخدمي Python. حاليًا، تُبسط الأداة مستوىً واحدًا من الوراثة + +على سبيل المثال: +- إذا ورثت فئة التكوين من فئة أخرى وأضافت/حذفت معامل، فسيتم إما الإشارة إلى الملف المولد مباشرةً + (في حالة الإضافة) أو إزالته تمامًا (في حالة الحذف). +- إذا ورثت فئة من فئة أخرى، على سبيل المثال: `class GemmaModel(LlamaModel):`، تُستنتج التبعيات تلقائيًا + سيتم استنتاج جميع الوحدات الفرعية تلقائيًا من الفئة الأصلية. +- إذا قمت بتعريف وظائف جديدة في الملف `modular` واستخدمتها داخل الفئات، فستستنتج أداة linter ذلك تلقائيًا + +يجب أن تكون قادرًا على كتابة كل شيء (المجزىء اللغوي، ومُعالِج الصور، والنموذج، والتكوين) في الملف `modular`، وسيتم إنشاء الملفات المُقابلة تلقائيًا. + +### التطبيق + +[TODO] نقدم اختبارًا جديدًا، للتأكد من أن المحتوى المولد يتطابق مع ما هو موجود في `modular_xxxx.py` + +### الأمثلة + +هنا مثال سريع باستخدام BERT و RoBERTa. النموذجان مرتبطان ارتباطًا وثيقًا: يختلف تنفيذهما النموذجي في طبقة تضمين. + +بدلاً من إعادة تعريف النموذج بالكامل، إليك كيف يبدو ملف `modular_roberta.py` لفئات النمذجة والتكوين (لأغراض المثال، يتم تجاهل المجزىء اللغوي في هذا الوقت حيث أنه مختلف جدًا). + +```python +from torch import nn +from ..bert.configuration_bert import BertConfig +from ..bert.modeling_bert import ( + BertModel, + BertEmbeddings, + BertForMaskedLM +) + +# تكوين RoBERTa مطابق لتكوين BERT +class RobertaConfig(BertConfig): + model_type = 'roberta' + +# نعيد تعريف الإضافات هنا لتسليط الضوء على اختلاف معرف الحشو، ونعيد تعريف الإضافات الموضعية +class RobertaEmbeddings(BertEmbeddings): + def __init__(self, config): + super().__init__(config()) + + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + +# نموذج RoBERTa مطابق لنموذج BERT، باستثناء طبقة الإضافات. 
+# نعيد تعريف الإضافات أعلاه، لذا هنا لا توجد حاجة لعمل إضافي +class RobertaModel(BertModel): + def __init__(self, config): + super().__init__(config) + self.embeddings = RobertaEmbeddings(config) + + +# الرؤوس الآن تحتاج فقط إلى إعادة تعريف النموذج داخل `RobertaModel` الصحيح +class RobertaForMaskedLM(BertForMaskedLM): + def __init__(self, config): + super().__init__(config) + self.model = RobertaModel(config) +``` + +لاحظ أنه إذا لم تستخدم الاعتماد الذي حددته، فستحصل على الخطأ التالي: + +```bash +ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should be used + when you define `BertModel`, as it is one of it's direct dependencies. Make sure + you use it in the `__init__` function. +``` + +بالإضافة إلى ذلك، قد تجد قائمة بالأمثلة هنا: + +## ما هو ليس كذلك + +ليس بديلاً لتعليمات برمجة النمذجة (بعد؟)، وإذا لم يكن نموذجك يعتمد على أي شيء آخر موجود من قبل، فيمكنك إضافة ملف `نمذجة` كالعادة. + + +## الاستخدام المتقدم + +### إزالة السمات والوظائف +لإزالة السمات التي لا تستخدم في نموذجك النمطي، والتي لا تريد رؤيتها في النمذجة المفككة: + +```python +class GemmaModel(LlamaModel): | class GemmaModel(PreTrainedModel): + def __init__(self, config): | def __init__(self, config): + super().__init__(self, eos_token) | super().__init__(config) + del self.embed_tokens | self.padding_idx = config.pad_token_id + | self.vocab_size = config.vocab_size + | + | self.layers = nn.ModuleList( + | [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + | ) + | self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + | self.rotary_emb = LlamaRotaryEmbedding(config=config) + | self.gradient_checkpointing = False + | + | # Initialize weights and apply final processing + | self.post_init() +``` +إذا قمت بالتحقق من `LlamaModel` الأصلي، فستجد `embed_tokens` الذي تمت إزالته هنا (كما هو متوقع!) + +إزالة وظيفة مشابهة، تحتاج فقط إلى كتابتها مع `raise ValueError("")` لمحاكاة السلوك الذي تريده فعليًا عند إزالة وظيفة أصلية في بايثون. + +```python +class GemmaTokenizer(LlamaTokenizer): + ... + + def get_spm_processor(self): + raise AttributeError("Not needed for Gemma") + + def unk_token_length(self): + raise AttributeError("Not needed for Gemma") +``` + +### تعريف وظائف جديدة + +إذا قمت بتعريف وظيفة جديدة في الملف `modular` لاستخدامها داخل فئة، على سبيل المثال + +```python +def my_new_function(*args, **kwargs): + # Do something here + pass + +class GemmaModel(LlamaModel): + def forward(*args, **kwargs): + # Call the function + example = my_new_function(*args, **kwargs) + # continue here +``` + +سيتم نسخ وظيفة `my_new_function` (وبشكل متكرر، أي وظائف أخرى جديدة يتم استدعاؤها في جسمها) تلقائيًا +في الملف الذي يتم استخدامه. + +### استدعاء `super()` +قمنا مؤخرًا بشحن بعض الميزات التي تسمح لك بالانتقال من: +```python +class GemmaTokenizer(LlamaTokenizer, PretrainedTokenizerFast): | class GemmaModel(nn.Module): + def __init__(self, eos_token=""): | def __init__(self): + eos_token = AddedToken(eos_token) | eos_token = AddedToken(eos_token) + PretrainedTokenizerFast.__init__(self, eos_token) | super().__init__(eos_token) +``` +هذا مفيد عندما لا تريد تفكيك استدعاء `super()`، وتريد التمييز بين أي استدعاء super init تقوم به! + +### التسمية الخاصة +ندعم الآن أيضًا حالات خاصة مثل +```python +class GemmaVisionModel(CLIPModel): + pass +``` +حيث اسم فئة `GemmaVision` الخاصة بك ليس هو نفسه `Gemma` النمطي. هذا مفيد للغاية للنماذج المركبة. 
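+
+### تشغيل أداة linter (مثال توضيحي)
+
+كمرجع عملي لما ورد أعلاه حول أداة linter: يوضّح السطر التالي طريقة محتملة لتوليد ملفات النمذجة المفردة من الملف النمطي محليًا. اسم السكربت والوسيطة هنا افتراض مبني على الإصدارات الحديثة من المكتبة وقد يختلفان بحسب إصدارك، لذا راجع مجلد `utils` في المستودع للتأكد.
+
+```bash
+# افتراض: قد يختلف اسم السكربت أو الوسيطة بين إصدارات المكتبة
+python utils/modular_model_converter.py --files_to_parse src/transformers/models/roberta/modular_roberta.py
+```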
diff --git a/docs/source/ar/tiktoken.md b/docs/source/ar/tiktoken.md new file mode 100644 index 00000000000000..6f3755d8670cdc --- /dev/null +++ b/docs/source/ar/tiktoken.md @@ -0,0 +1,41 @@ +# Tiktoken والتفاعل مع Transformers + +يتم دمج دعم ملفات نموذج tiktoken بسلاسة في 🤗 transformers عند تحميل النماذج +`from_pretrained` مع ملف `tokenizer.model` tiktoken على Hub، والذي يتم تحويله تلقائيًا إلى [المحلل اللغوي السريع](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast). + +### النماذج المعروفة التي تم إصدارها مع `tiktoken.model`: + - gpt2 + - llama3 + +## مثال على الاستخدام + +من أجل تحميل ملفات `tiktoken` في `transformers`، تأكد من أن ملف `tokenizer.model` هو ملف tiktoken وسيتم تحميله تلقائيًا عند التحميل `from_pretrained`. إليك كيفية تحميل مجزىء لغوي ونموذج، والذي +يمكن تحميله من نفس الملف بالضبط: + +```py +from transformers import AutoTokenizer + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original") +``` +## إنشاء مجزىء لغوي tiktoken + +لا يحتوي ملف `tokenizer.model` على أي معلومات حول الرموز أو الأنماط الإضافية. إذا كانت هذه الأمور مهمة، قم بتحويل المحلل اللغوي إلى `tokenizer.json`، وهو التنسيق المناسب لـ [`PreTrainedTokenizerFast`]. + +قم بتوليد ملف `tokenizer.model` باستخدام [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) ثم قم بتحويله إلى `tokenizer.json` باستخدام [`convert_tiktoken_to_fast`]. + +```py + +from transformers.integrations.tiktoken import convert_tiktoken_to_fast +from tiktoken import get_encoding + +# يمكنك تحميل ترميزك المخصص أو الترميز الذي توفره OpenAI +encoding = get_encoding("gpt2") +convert_tiktoken_to_fast(encoding, "config/save/dir") +``` + +يتم حفظ ملف `tokenizer.json` الناتج في الدليل المحدد ويمكن تحميله باستخدام [`PreTrainedTokenizerFast`]. 
+ +```py +tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir") +``` From 52d135426f387862db8c73ea5ab8f69338ee81c7 Mon Sep 17 00:00:00 2001 From: Henry Hyeonmok Ko <52618631+henryhmko@users.noreply.github.com> Date: Tue, 10 Dec 2024 09:08:55 -0800 Subject: [PATCH 020/110] Multiple typo fixes in NLP, Audio docs (#35181) Fixed multiple typos in Tutorials, NLP, and Audio sections --- docs/source/en/tasks/asr.md | 2 +- docs/source/en/tasks/multiple_choice.md | 4 ++-- docs/source/en/tasks/question_answering.md | 4 ++-- docs/source/en/tasks/summarization.md | 2 +- docs/source/en/tasks/translation.md | 2 +- src/transformers/integrations/peft.py | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/en/tasks/asr.md b/docs/source/en/tasks/asr.md index f3e068444ca556..87b8f024420ce6 100644 --- a/docs/source/en/tasks/asr.md +++ b/docs/source/en/tasks/asr.md @@ -112,7 +112,7 @@ The next step is to load a Wav2Vec2 processor to process the audio signal: >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base") ``` -The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model: +The MInDS-14 dataset has a sampling rate of 8000Hz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000Hz to use the pretrained Wav2Vec2 model: ```py >>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md index 06eb45eda99150..18b12f2166637e 100644 --- a/docs/source/en/tasks/multiple_choice.md +++ b/docs/source/en/tasks/multiple_choice.md @@ -419,7 +419,7 @@ Get the class with the highest probability: ```py >>> predicted_class = logits.argmax().item() >>> predicted_class -'0' +0 ``` @@ -448,7 +448,7 @@ Get the class with the highest probability: ```py >>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) >>> predicted_class -'0' +0 ``` diff --git a/docs/source/en/tasks/question_answering.md b/docs/source/en/tasks/question_answering.md index 998010e67ca95f..41d7fd48cf816e 100644 --- a/docs/source/en/tasks/question_answering.md +++ b/docs/source/en/tasks/question_answering.md @@ -325,7 +325,7 @@ or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/no Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so you're not completely in the dark about your model's performance. -If have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) chapter from the 🤗 Hugging Face Course! +If you have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) chapter from the 🤗 Hugging Face Course! 
## Inference @@ -397,7 +397,7 @@ Tokenize the text and return TensorFlow tensors: >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") ->>> inputs = tokenizer(question, text, return_tensors="tf") +>>> inputs = tokenizer(question, context, return_tensors="tf") ``` Pass your inputs to the model and return the `logits`: diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md index 7d7ecf1fbab6db..e16dd17dfe1fc8 100644 --- a/docs/source/en/tasks/summarization.md +++ b/docs/source/en/tasks/summarization.md @@ -283,7 +283,7 @@ Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: ```py >>> from transformers.keras_callbacks import KerasMetricCallback ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) ``` Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md index 426ba1c340fb81..922cdc7241176a 100644 --- a/docs/source/en/tasks/translation.md +++ b/docs/source/en/tasks/translation.md @@ -290,7 +290,7 @@ Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: ```py >>> from transformers.keras_callbacks import KerasMetricCallback ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) ``` Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index ef09281431169f..69e674a2160643 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -108,7 +108,7 @@ def load_adapter( token (`str`, `optional`): - Whether to use authentication token to load the remote folder. Userful to load private repositories + Whether to use authentication token to load the remote folder. Useful to load private repositories that are on HuggingFace Hub. You might need to call `huggingface-cli login` and paste your tokens to cache it. 
device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*): From 217c47e31bc0cd442443e5b4a62c8bc2785d53ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=A9tan=20Lepage?= <33058747+GaetanLepage@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:19:30 +0100 Subject: [PATCH 021/110] Only import torch.distributed if it is available (#35133) --- src/transformers/pytorch_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py index 5bdf8a355ddfaa..fab1b9118d18d3 100644 --- a/src/transformers/pytorch_utils.py +++ b/src/transformers/pytorch_utils.py @@ -38,8 +38,10 @@ is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse("1.13") is_torch_greater_or_equal_than_1_12 = parsed_torch_version_base >= version.parse("1.12") +# Cache this result has it's a C FFI call which can be pretty time-consuming +_torch_distributed_available = torch.distributed.is_available() -if is_torch_greater_or_equal("2.5"): +if is_torch_greater_or_equal("2.5") and _torch_distributed_available: from torch.distributed.tensor import Replicate from torch.distributed.tensor.parallel import ( ColwiseParallel, From 91b8ab18b778ae9e2f8191866e018cd1dc7097be Mon Sep 17 00:00:00 2001 From: French_Ball <127096560+asdkfjsd@users.noreply.github.com> Date: Wed, 11 Dec 2024 01:58:47 +0800 Subject: [PATCH 022/110] [i18n-] Translating Benchmarks.md to Chinese (#35137) * add "Translating Benchmarks.md to Chinese " * Removed all the English original text (which was previously kept as comments in the document) and refined some of the Chinese expressions. --- docs/source/zh/_toctree.yml | 2 + docs/source/zh/benchmarks.md | 377 +++++++++++++++++++++++++++++++++++ 2 files changed, 379 insertions(+) create mode 100644 docs/source/zh/benchmarks.md diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml index d4863efde710ea..c4c5890ed0b3f4 100644 --- a/docs/source/zh/_toctree.yml +++ b/docs/source/zh/_toctree.yml @@ -50,6 +50,8 @@ title: 导出为 TFLite - local: torchscript title: 导出为 TorchScript + - local: benchmarks + title: 对模型进行基准测试 - local: gguf title: 与 GGUF 格式的互操作性 - local: tiktoken diff --git a/docs/source/zh/benchmarks.md b/docs/source/zh/benchmarks.md new file mode 100644 index 00000000000000..2e9787c9a3bb6b --- /dev/null +++ b/docs/source/zh/benchmarks.md @@ -0,0 +1,377 @@ + + +# 基准测试 + + + +小提示:Hugging Face的基准测试工具已经不再更新,建议使用外部基准测试库来衡量Transformer模 +型的速度和内存复杂度。 + + + +[[open-in-colab]] + +让我们来看看如何对🤗 Transformers模型进行基准测试,以及进行测试的推荐策略和已有的基准测试结果。 + +如果您需要更详细的回答,可以在[这里](https://github.com/huggingface/notebooks/tree/main/examples/benchmark.ipynb)找到更多关于基准测试的内容。 + + +## 如何对🤗 Transformers模型进行基准测试 + +使用[`PyTorchBenchmark`]和[`TensorFlowBenchmark`]类可以灵活地对🤗 Transformers模型进行基准测试。这些基准测试类可以衡量模型在**推理**和**训练**过程中所需的**峰值内存**和**时间**。 + + + +这里的**推理**指的是一次前向传播(forward pass),而训练则指一次前向传播和反向传播(backward pass)。 + + + + +基准测试类 [`PyTorchBenchmark`] 和 [`TensorFlowBenchmark`] 需要分别传入 [`PyTorchBenchmarkArguments`] 和 [`TensorFlowBenchmarkArguments`] 类型的对象来进行实例化。这些类是数据类型,包含了所有相关的配置参数,用于其对应的基准测试类。 + +在下面的示例中,我们展示了如何对类型为 **bert-base-cased** 的BERT模型进行基准测试: + + + +```py +>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments + +>>> args = PyTorchBenchmarkArguments(models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) +>>> benchmark = PyTorchBenchmark(args) +``` + + +```py +>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments + 
+>>> args = TensorFlowBenchmarkArguments( +... models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> benchmark = TensorFlowBenchmark(args) +``` + + + +在这里,基准测试的参数数据类接受了三个主要的参数,即 `models`、`batch_sizes` 和`sequence_lengths`。其中,`models` 是必需的参数,它期望一个来自[模型库](https://huggingface.co/models)的模型标识符列表。`batch_sizes` 和 `sequence_lengths` 是列表类型的参数,定义了进行基准测试时 `input_ids` 的批量大小和序列长度。 + +这些是基准测试数据类中可以配置的一些主要参数。除此之外,基准测试数据类中还可以配置很多其他参数。如需要查看更详细的配置参数,可以直接查看以下文件: + +* `src/transformers/benchmark/benchmark_args_utils.py` +* `src/transformers/benchmark/benchmark_args.py`(针对 PyTorch) +* `src/transformers/benchmark/benchmark_args_tf.py`(针对 TensorFlow) + +另外,您还可以通过在根目录下运行以下命令,查看针对 PyTorch 和 TensorFlow 的所有可配置参数的描述列表: +``` bash python examples/pytorch/benchmarking/run_benchmark.py --help ``` +这些命令将列出所有可以配置的参数,它们可以帮助您更加灵活地进行基准测试。 + + + + + + +以下代码通过`PyTorchBenchmarkArguments`设置模型批处理大小和序列长度,然后调用`benchmark.run()`执行基准测试。 + +```py +>>> results = benchmark.run() +>>> print(results) +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 0.006 +google-bert/bert-base-uncased 8 32 0.006 +google-bert/bert-base-uncased 8 128 0.018 +google-bert/bert-base-uncased 8 512 0.088 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 1227 +google-bert/bert-base-uncased 8 32 1281 +google-bert/bert-base-uncased 8 128 1307 +google-bert/bert-base-uncased 8 512 1539 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: PyTorch +- use_torchscript: False +- framework_version: 1.4.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 08:58:43.371351 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + +```bash +python examples/tensorflow/benchmarking/run_benchmark_tf.py --help +``` + +接下来,只需要调用 `benchmark.run()` 就能轻松运行已经实例化的基准测试对象。 + +```py +>>> results = benchmark.run() +>>> print(results) +>>> results = benchmark.run() +>>> print(results) +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 0.005 +google-bert/bert-base-uncased 8 32 0.008 +google-bert/bert-base-uncased 8 128 0.022 +google-bert/bert-base-uncased 8 512 0.105 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== 
+-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 1330 +google-bert/bert-base-uncased 8 32 1330 +google-bert/bert-base-uncased 8 128 1330 +google-bert/bert-base-uncased 8 512 1770 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: Tensorflow +- use_xla: False +- framework_version: 2.2.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:26:35.617317 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + + + +在一般情况下,基准测试会测量推理(inference)的**时间**和**所需内存**。在上面的示例输出中,前两部分显示了与**推理时间**和**推理内存**对应的结果。与此同时,关于计算环境的所有相关信息(例如 GPU 类型、系统、库版本等)会在第三部分的**环境信息**中打印出来。你可以通过在 [`PyTorchBenchmarkArguments`] 和 [`TensorFlowBenchmarkArguments`] 中添加 `save_to_csv=True`参数,将这些信息保存到一个 .csv 文件中。在这种情况下,每一部分的信息会分别保存在不同的 .csv 文件中。每个 .csv 文件的路径也可以通过参数数据类进行定义。 + + +您可以选择不通过预训练模型的模型标识符(如 `google-bert/bert-base-uncased`)进行基准测试,而是对任何可用模型类的任意配置进行基准测试。在这种情况下,我们必须将一系列配置与基准测试参数一起传入,方法如下: + + + +```py +>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig + +>>> args = PyTorchBenchmarkArguments( +... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> config_base = BertConfig() +>>> config_384_hid = BertConfig(hidden_size=384) +>>> config_6_lay = BertConfig(num_hidden_layers=6) + +>>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +>>> benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base 8 128 0.006 +bert-base 8 512 0.006 +bert-base 8 128 0.018 +bert-base 8 512 0.088 +bert-384-hid 8 8 0.006 +bert-384-hid 8 32 0.006 +bert-384-hid 8 128 0.011 +bert-384-hid 8 512 0.054 +bert-6-lay 8 8 0.003 +bert-6-lay 8 32 0.004 +bert-6-lay 8 128 0.009 +bert-6-lay 8 512 0.044 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base 8 8 1277 +bert-base 8 32 1281 +bert-base 8 128 1307 +bert-base 8 512 1539 +bert-384-hid 8 8 1005 +bert-384-hid 8 32 1027 +bert-384-hid 8 128 1035 +bert-384-hid 8 512 1255 +bert-6-lay 8 8 1097 +bert-6-lay 8 32 1101 +bert-6-lay 8 128 1127 +bert-6-lay 8 512 1359 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: PyTorch +- use_torchscript: False +- framework_version: 1.4.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:35:25.143267 +- 
fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + +```py +>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig + +>>> args = TensorFlowBenchmarkArguments( +... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> config_base = BertConfig() +>>> config_384_hid = BertConfig(hidden_size=384) +>>> config_6_lay = BertConfig(num_hidden_layers=6) + +>>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +>>> benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base 8 8 0.005 +bert-base 8 32 0.008 +bert-base 8 128 0.022 +bert-base 8 512 0.106 +bert-384-hid 8 8 0.005 +bert-384-hid 8 32 0.007 +bert-384-hid 8 128 0.018 +bert-384-hid 8 512 0.064 +bert-6-lay 8 8 0.002 +bert-6-lay 8 32 0.003 +bert-6-lay 8 128 0.0011 +bert-6-lay 8 512 0.074 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base 8 8 1330 +bert-base 8 32 1330 +bert-base 8 128 1330 +bert-base 8 512 1770 +bert-384-hid 8 8 1330 +bert-384-hid 8 32 1330 +bert-384-hid 8 128 1330 +bert-384-hid 8 512 1540 +bert-6-lay 8 8 1330 +bert-6-lay 8 32 1330 +bert-6-lay 8 128 1330 +bert-6-lay 8 512 1540 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: Tensorflow +- use_xla: False +- framework_version: 2.2.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:38:15.487125 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + + + **推理时间**和**推理所需内存**会被重新测量,不过这次是针对 `BertModel` 类的自定义配置进行基准测试。这个功能在决定模型应该使用哪种配置进行训练时尤其有用。 + + +## 基准测试的推荐策略 +本节列出了一些在对模型进行基准测试时比较推荐的策略: + +* 目前,该模块只支持单设备基准测试。在进行 GPU 基准测试时,建议用户通过设置 `CUDA_VISIBLE_DEVICES` 环境变量来指定代码应在哪个设备上运行,例如在运行代码前执行 `export CUDA_VISIBLE_DEVICES=0`。 +* `no_multi_processing` 选项仅应在测试和调试时设置为 `True`。为了确保内存测量的准确性,建议将每个内存基准测试单独运行在一个进程中,并确保 `no_multi_processing` 设置为 `True`。 +* 当您分享模型基准测试结果时,应始终提供环境信息。由于 GPU 设备、库版本等之间可能存在较大差异,单独的基准测试结果对社区的帮助有限。 + + +## 分享您的基准测试结果 + +先前的所有可用的核心模型(当时有10个)都已针对 **推理时间** 进行基准测试,涵盖了多种不同的设置:使用 PyTorch(包不包含 TorchScript),使用 TensorFlow(包不包含 XLA)。所有的测试都在 CPU(除了 TensorFlow XLA)和 GPU 上进行。 + +这种方法的详细信息可以在 [这篇博客](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) 中找到,测试结果可以在 [这里](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing) 查看。 + + +您可以借助新的 **基准测试** 工具比以往任何时候都更容易地分享您的基准测试结果! 
+ +- [PyTorch 基准测试结果](https://github.com/huggingface/transformers/tree/main/examples/pytorch/benchmarking/README.md) +- [TensorFlow 基准测试结果](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/benchmarking/README.md) + + From 5290f6a62dd4892b07ff290087591d8a625a9f1f Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:36:25 -0800 Subject: [PATCH 023/110] [docs] Fix FlashAttention link (#35171) fix link --- docs/source/en/model_doc/idefics2.md | 2 +- docs/source/en/model_doc/llava_next_video.md | 2 +- docs/source/en/model_doc/mistral.md | 2 +- docs/source/en/model_doc/mixtral.md | 2 +- docs/source/en/model_doc/video_llava.md | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index 5ad56b7b5c525d..b9b51082f29e5b 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -141,7 +141,7 @@ Do note that when training Idefics2 on multi-turn conversations between a user a ## Model optimizations: Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index f8a149f12b6779..cc3a61aae6c736 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -240,7 +240,7 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained("llava-hf/LLaVA-N ### Flash-Attention 2 to speed-up generation -Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2: diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index 2be657109a8d46..cfa2af3678137a 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -91,7 +91,7 @@ As can be seen, the instruction-tuned model requires a [chat template](../chat_t ## Speeding up Mistral by using Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. 
However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 7afcaa798ecac4..b5451702e44a16 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -93,7 +93,7 @@ As can be seen, the instruction-tuned model requires a [chat template](../chat_t ## Speeding up Mixtral by using Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 105307196effd0..a3ba1258ecfa06 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -174,7 +174,7 @@ model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-L ### Flash-Attention 2 to speed-up generation -Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2: From e8508924fdfe1ad96e35d23e795eb7f7beec836a Mon Sep 17 00:00:00 2001 From: John Graham Reynolds Date: Tue, 10 Dec 2024 18:16:01 -0500 Subject: [PATCH 024/110] Update data collator docstrings to accurately reference Nvidia tensor core compute capability version (#35188) update data collator docs to reflect correct tensor core compute capability Co-authored-by: John Graham Reynolds --- src/transformers/data/data_collator.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 9e75e6fd3c38df..e84c9d0ef3ce2e 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -256,7 +256,7 @@ class DataCollatorWithPadding: If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= - 7.5 (Volta). + 7.0 (Volta). return_tensors (`str`, *optional*, defaults to `"pt"`): The type of Tensor to return. Allowable values are "np", "pt" and "tf". """ @@ -308,7 +308,7 @@ class DataCollatorForTokenClassification(DataCollatorMixin): If set will pad the sequence to a multiple of the provided value. 
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= - 7.5 (Volta). + 7.0 (Volta). label_pad_token_id (`int`, *optional*, defaults to -100): The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions). return_tensors (`str`, *optional*, defaults to `"pt"`): @@ -568,7 +568,7 @@ class DataCollatorForSeq2Seq: If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= - 7.5 (Volta). + 7.0 (Volta). label_pad_token_id (`int`, *optional*, defaults to -100): The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions). return_tensors (`str`, *optional*, defaults to `"pt"`): @@ -693,6 +693,9 @@ class DataCollatorForLanguageModeling(DataCollatorMixin): The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`. pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.0 (Volta). return_tensors (`str`): The type of Tensor to return. Allowable values are "np", "pt" and "tf". From 10feacd88aef9569e240b7e3833ab32b297e4460 Mon Sep 17 00:00:00 2001 From: HMJ0628 <2383422508@qq.com> Date: Wed, 11 Dec 2024 07:16:37 +0800 Subject: [PATCH 025/110] [i18n-] Translating agents.md to Chinese (#35139) * add "translate agents.md" * add "agents.md" * add "translate warnings" * add "totree" * add "remove transformer_agent" * add "remove transformer _agent file" --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/zh/_toctree.yml | 6 +- docs/source/zh/agents.md | 427 ++++++++++++++++++++++++++ docs/source/zh/transformers_agents.md | 285 ----------------- 3 files changed, 430 insertions(+), 288 deletions(-) create mode 100644 docs/source/zh/agents.md delete mode 100644 docs/source/zh/transformers_agents.md diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml index c4c5890ed0b3f4..bd0cc7c7f7f97d 100644 --- a/docs/source/zh/_toctree.yml +++ b/docs/source/zh/_toctree.yml @@ -23,8 +23,8 @@ title: 使用🤗 PEFT加载和训练adapters - local: model_sharing title: 分享您的模型 - - local: transformers_agents - title: agents教程 + - local: agents + title: 智能体和工具 - local: llm_tutorial title: 使用LLMs进行生成 title: 教程 @@ -102,7 +102,7 @@ - sections: - sections: - local: main_classes/agent - title: Agents和工具 + title: 智能体和工具 - local: main_classes/callback title: Callbacks - local: main_classes/configuration diff --git a/docs/source/zh/agents.md b/docs/source/zh/agents.md new file mode 100644 index 00000000000000..00fa74e6545025 --- /dev/null +++ b/docs/source/zh/agents.md @@ -0,0 +1,427 @@ + +# 智能体和工具 + +[[在colab里打开]] + +### 什么是智能体 (Agent)? + +大型语言模型(LLM)经过 [因果语言建模训练](./tasks/language_modeling) 可以应对各种任务,但在一些基本任务(如逻辑推理、计算和搜索)上常常表现不佳。当它们被用在自己不擅长的领域时,往往无法生成我们期望的答案。 + +为了解决这个问题,可以创建**智能体**. 

智能体是一个系统，它使用 LLM 作为引擎，并且能够访问称为**工具**的功能。

这些**工具**是执行任务的函数，包含所有必要的描述信息，帮助智能体正确使用它们。

智能体可以被编程为：
- 一次性规划出一系列工具调用并同时执行它们，例如 [`CodeAgent`]
- 每次执行一个工具调用，并等待其结果后再启动下一个，例如 [`ReactJsonAgent`]

### 智能体类型

#### 代码智能体

此智能体包含一个规划步骤，然后生成 Python 代码一次性执行所有任务。它原生支持处理不同输入和输出类型，因此推荐用于多模态任务。

#### 推理智能体

这是解决推理任务的首选智能体，因为 ReAct 框架 ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) 使其在基于之前观察进行推理时非常高效。

我们实现了两种版本的推理智能体：
- [`ReactJsonAgent`] 将工具调用以 JSON 格式输出。
- [`ReactCodeAgent`] 是 ReactJsonAgent 的一种变体，它以代码块的形式生成工具调用，对于具备强大编程能力的 LLM 非常适用。

> [!TIP]
> 阅读 [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) 博文，了解更多关于推理智能体的信息。
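
为了更直观地对比这几种智能体，下面给出一个极简的初始化示意（其中的 LLM 引擎和模型名称仅为示例，完整用法请参考后文「如何构建智能体?」一节）：

```python
from transformers import CodeAgent, HfApiEngine, ReactCodeAgent, ReactJsonAgent

# 示例引擎：后文"创建LLM引擎"一节会详细介绍，模型名称仅为示例
llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct")

# 代码智能体：先规划，再一次性生成并执行全部代码
code_agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)

# 推理智能体(JSON 版)：每一步以 JSON 形式输出一次工具调用
json_agent = ReactJsonAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)

# 推理智能体(代码版)：每一步生成一个代码块来调用工具
react_code_agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
```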
+ + +
+ +![推理智能体的框架](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png) + +以下是一个推理代码智能体如何处理以下问题的示例: + +```py3 +>>> agent.run( +... "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?", +... ) +=====New task===== +How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need? +====Agent is executing the code below: +bert_blocks = search(query="number of blocks in BERT base encoder") +print("BERT blocks:", bert_blocks) +==== +Print outputs: +BERT blocks: twelve encoder blocks + +====Agent is executing the code below: +attention_layer = search(query="number of layers in Attention is All You Need") +print("Attention layers:", attention_layer) +==== +Print outputs: +Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture. + +====Agent is executing the code below: +bert_blocks = 12 +attention_layers = 6 +diff = bert_blocks - attention_layers +print("Difference in blocks:", diff) +final_answer(diff) +==== + +Print outputs: +Difference in blocks: 6 + +Final answer: 6 +``` + +### 如何构建智能体? + +要初始化一个智能体,您需要以下参数: + +- **一个 LLM** 来驱动智能体——智能体本身并不是 LLM,而是一个使用 LLM 作为引擎的程序。 +- **一个系统提示**:告诉 LLM 引擎应该如何生成输出。 +- **一个工具箱**,智能体可以从中选择工具执行。 +- **一个解析器**,从 LLM 输出中提取出哪些工具需要调用,以及使用哪些参数。 + +在智能体系统初始化时,工具属性将生成工具描述,并嵌入到智能体的系统提示中,告知智能体可以使用哪些工具,并且为什么使用它们。 + +**安装依赖** + +首先,您需要安装**智能体**所需的额外依赖: + +```bash +pip install transformers[agents] +``` +**创建LLM引擎** + +定义一个 `llm_engine` 方法,该方法接受一系列[消息](./chat_templating)并返回文本。该 `callable` 还需要接受一个 `stop` 参数,用于指示何时停止生成输出。 + +```python +from huggingface_hub import login, InferenceClient + +login("") + +client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct") + +def llm_engine(messages, stop_sequences=["Task"]) -> str: + response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000) + answer = response.choices[0].message.content + return answer +``` + +您可以使用任何符合以下要求的 `llm_engine` 方法: +1. [输入格式](./chat_templating)为 (`List[Dict[str, str]]`),并且返回一个字符串。 +2. 它在 `stop_sequences` 参数传递的序列处停止生成输出。 + +此外,`llm_engine` 还可以接受一个 `grammar` 参数。如果在智能体初始化时指定了 `grammar`,则该参数将传递给 `llm_engine` 的调用,以允许[受限生成](https://huggingface.co/docs/text-generation-inference/conceptual/guidance),以强制生成格式正确的智能体输出。 + +您还需要一个 `tools` 参数,它接受一个 `Tools` 列表 —— 可以是空列表。您也可以通过定义可选参数 `add_base_tools=True` 来将默认工具箱添加到工具列表中。 + +现在,您可以创建一个智能体,例如 [`CodeAgent`],并运行它。您还可以创建一个 [`TransformersEngine`],使用 `transformers` 在本地机器上运行预初始化的推理管道。 为了方便起见,由于智能体行为通常需要更强大的模型,例如 `Llama-3.1-70B-Instruct`,它们目前较难在本地运行,我们还提供了 [`HfApiEngine`] 类,它在底层初始化了一个 `huggingface_hub.InferenceClient`。 + +```python +from transformers import CodeAgent, HfApiEngine + +llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct") +agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) + +agent.run( + "Could you translate this sentence from French, say it out loud and return the audio.", + sentence="Où est la boulangerie la plus proche?", +) +``` + +当你急需某个东西时这将会很有用! 
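
上文提到的 [`TransformersEngine`] 可以让智能体完全在本地运行。下面是一个简单示意（假设您安装的 transformers 版本提供 [`TransformersEngine`]，模型名称仅为示例，较小的模型可能无法很好地完成智能体任务）：

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, CodeAgent, TransformersEngine, pipeline

# 仅为示意：可替换为任意支持文本生成的本地模型
model_id = "HuggingFaceTB/SmolLM-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

# 用预初始化的 pipeline 构建本地 LLM 引擎
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
llm_engine = TransformersEngine(pipe)

agent = CodeAgent(tools=[], llm_engine=llm_engine)
agent.run("What is the result of 2 to the power of 3.7384?")
```

相比 [`HfApiEngine`]，这种方式不依赖推理 API，但对本地硬件和模型本身的能力要求更高。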
+您甚至可以将 `llm_engine` 参数留空,默认情况下会创建一个 [`HfApiEngine`]。 + +```python +from transformers import CodeAgent + +agent = CodeAgent(tools=[], add_base_tools=True) + +agent.run( + "Could you translate this sentence from French, say it out loud and give me the audio.", + sentence="Où est la boulangerie la plus proche?", +) +``` + +请注意,我们使用了额外的 `sentence` 参数:您可以将文本作为附加参数传递给模型。 + +您还可以使用这个来指定本地或远程文件的路径供模型使用: + +```py +from transformers import ReactCodeAgent + +agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) + +agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3") +``` + +系统提示和输出解析器会自动定义,但您可以通过调用智能体的 `system_prompt_template` 来轻松查看它们。 + +```python +print(agent.system_prompt_template) +``` + +尽可能清楚地解释您要执行的任务非常重要。 每次 [`~Agent.run`] 操作都是独立的,并且由于智能体是由 LLM 驱动的,提示中的细微变化可能会导致完全不同的结果。 +您还可以连续运行多个任务,每次都会重新初始化智能体的 `agent.task` 和 `agent.logs` 属性。 + + +#### 代码执行 + +Python 解释器在一组输入和工具上执行代码。 这应该是安全的,因为只能调用您提供的工具(特别是 Hugging Face 的工具)和 print 函数,因此您已经限制了可以执行的操作。 + +Python 解释器默认不允许导入不在安全列表中的模块,因此大多数明显的攻击问题应该不成问题。 您仍然可以通过在 [`ReactCodeAgent`] 或 [`CodeAgent`] 初始化时通过 `additional_authorized_imports` 参数传递一个授权的模块列表来授权额外的导入: + +```py +>>> from transformers import ReactCodeAgent + +>>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4']) +>>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?") + +(...) +'Hugging Face – Blog' +``` + +如果有任何代码尝试执行非法操作,或者生成的代码出现常规 Python 错误,执行将停止。 + +> [!WARNING] +> 在使用大语言模型(LLM)生成代码时,生成的代码会被执行,避免导入或使用任何不安全的库或模块。 + +### 系统提示 + +智能体,或者说驱动智能体的 LLM,根据系统提示生成输出。系统提示可以定制并根据目标任务进行调整。例如,检查 [`ReactCodeAgent`] 的系统提示(以下版本经过简化)。 + +```text +You will be given a task to solve as best you can. +You have access to the following tools: +<> + +To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. + +At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use. +Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence. +During each intermediate step, you can use 'print()' to save whatever important information you will then need. +These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step. + +In the end you have to return a final answer using the `final_answer` tool. + +Here are a few examples using notional tools: +--- +{examples} + +Above example were using notional tools that might not exist for you. You only have acces to those tools: +<> +You also can perform computations in the python code you generate. + +Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```' sequence. You MUST provide at least the 'Code:' sequence to move forward. + +Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks. +Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result. + +Remember to make sure that variables you use are all defined. + +Now Begin! 
+``` + +系统提示包括: +- 解释智能体应该如何工作以及工具的**介绍**。 +- 所有工具的描述由 `<>` 标记在运行时动态替换,这样智能体就知道可以使用哪些工具及其用途。 + - 工具的描述来自工具的属性,`name`、`description`、`inputs` 和 `output_type`,以及一个简单的 `jinja2` 模板,您可以根据需要进行调整。 +- 期望的输出格式。 + +您可以通过向 `system_prompt` 参数传递自定义提示来最大程度地提高灵活性,从而覆盖整个系统提示模板。 + +```python +from transformers import ReactJsonAgent +from transformers.agents import PythonInterpreterTool + +agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}") +``` + +> [WARNING] +> 必须在`template`中定义 `<>` 这个变量,以便智能体能够正确地识别并使用可用的工具 + + +### 检查智能体的运行 + +以下是检查运行后发生了什么的一些有用属性: +- `agent.logs` 存储了智能体的详细日志。每一步的所有内容都会存储在一个字典中,然后附加到 `agent.logs`。 +- 运行 `agent.write_inner_memory_from_logs()` 会从日志中创建智能体的内存,以便 LLM 查看,作为一系列聊天消息。此方法会遍历日志的每个步骤,只保存其感兴趣的消息:例如,它会单独保存系统提示和任务,然后为每个步骤保存 LLM 输出的消息,以及工具调用输出的消息。如果您想要更高层次的查看发生了什么,可以使用此方法 —— 但并不是每个日志都会被此方法转录。 + +## 工具 + +工具是智能体使用的基本功能。 + +例如,您可以检查 [`PythonInterpreterTool`]:它有一个名称、描述、输入描述、输出类型和 `__call__` 方法来执行该操作。 + +当智能体初始化时,工具属性会用来生成工具描述,然后将其嵌入到智能体的系统提示中,这让智能体知道可以使用哪些工具以及为什么使用它们。 + +### 默认工具箱 + +Transformers 提供了一个默认工具箱,用于增强智能体,您可以在初始化时通过 `add_base_tools=True` 参数将其添加到智能体中: + +- **文档问答**:给定一个文档(如图像格式的 PDF),回答关于该文档的问题([Donut](./model_doc/donut)) +- **图像问答**:给定一张图片,回答关于该图像的问题([VILT](./model_doc/vilt)) +- **语音转文本**:给定一个人讲述的音频录音,将其转录为文本(Whisper) +- **文本转语音**:将文本转换为语音([SpeechT5](./model_doc/speecht5)) +- **翻译**:将给定的句子从源语言翻译为目标语言 +- **DuckDuckGo 搜索**:使用 `DuckDuckGo` 浏览器进行网络搜索 +- **Python 代码解释器**:在安全环境中运行 LLM 生成的 Python 代码。只有在初始化 [`ReactJsonAgent`] 时将 `add_base_tools=True` 时,代码智能体才会添加此工具,因为基于代码的智能体已经能够原生执行 Python 代码 + + +您可以通过调用 [`load_tool`] 函数来手动使用某个工具并执行任务。 + + +```python +from transformers import load_tool + +tool = load_tool("text-to-speech") +audio = tool("This is a text to speech tool") +``` + + +### 创建新工具 + +您可以为 `Hugging Face` 默认工具无法涵盖的用例创建自己的工具。 +例如,假设我们要创建一个返回在 `Hugging Face Hub` 上某个任务中下载次数最多的模型的工具。 + +您将从以下代码开始: + +```python +from huggingface_hub import list_models + +task = "text-classification" + +model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) +print(model.id) +``` + +这段代码可以很快转换为工具,只需将其包装成一个函数,并添加 `tool` 装饰器: + + +```py +from transformers import tool + +@tool +def model_download_tool(task: str) -> str: + """ + This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. + It returns the name of the checkpoint. + + Args: + task: The task for which + """ + model = next(iter(list_models(filter="text-classification", sort="downloads", direction=-1))) + return model.id +``` + +该函数需要: +- 一个清晰的名称。名称通常描述工具的功能。由于代码返回某个任务中下载次数最多的模型,因此我们将其命名为 `model_download_tool`。 +- 对输入和输出进行类型提示 +- 描述,其中包括 "`Args`:" 部分,描述每个参数(这次不需要类型指示,它会从类型提示中获取)。 + +所有这些将自动嵌入到智能体的系统提示中,因此请尽量使它们尽可能清晰! + +> [TIP] +> 这个定义格式与 apply_chat_template 中使用的工具模式相同,唯一的区别是添加了 tool 装饰器:可以在我们的工具使用 API 中[了解更多](https://huggingface.co/blog/unified-tool-use#passing-tools-to-a-chat-template). + +然后,您可以直接初始化您的智能体: +```py +from transformers import CodeAgent +agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine) +agent.run( + "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" +) +``` + +您将得到以下输出: +```text +======== New task ======== +Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub? 
+==== Agent is executing the code below: +most_downloaded_model = model_download_tool(task="text-to-video") +print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.") +==== +``` + +输出: +`"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."` + +### 管理智能体的工具箱 + +如果您已经初始化了一个智能体,但想添加一个新的工具,重新初始化智能体会很麻烦。借助 Transformers,您可以通过添加或替换工具来管理智能体的工具箱。 + +让我们将 `model_download_tool` 添加到一个仅初始化了默认工具箱的现有智能体中。 + +```python +from transformers import CodeAgent + +agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) +agent.toolbox.add_tool(model_download_tool) +``` +现在,我们可以同时使用新工具和之前的文本到语音工具: + +```python +agent.run( + "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?" +) +``` + + +| **Audio** | +|------------------------------------------------------------------------------------------------------------------------------------------------------| +|