diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index be8952903e2ce2..71c75dac2ff053 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -55,6 +55,7 @@ def to_dict(self): return { "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE), + "resource_class": "small", "steps": steps, } @@ -67,9 +68,9 @@ class CircleCIJob: install_steps: List[str] = None marker: Optional[str] = None parallelism: Optional[int] = 0 - pytest_num_workers: int = 12 + pytest_num_workers: int = 8 pytest_options: Dict[str, Any] = None - resource_class: Optional[str] = "2xlarge" + resource_class: Optional[str] = "xlarge" tests_to_run: Optional[List[str]] = None num_test_files_per_worker: Optional[int] = 10 # This should be only used for doctest job! @@ -198,7 +199,6 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="not generate", parallelism=6, - pytest_num_workers=8 ) generate_job = CircleCIJob( @@ -206,28 +206,24 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="generate", parallelism=6, - pytest_num_workers=8 ) tokenization_job = CircleCIJob( "tokenization", docker_image=[{"image": "huggingface/transformers-torch-light"}], parallelism=8, - pytest_num_workers=16 ) processor_job = CircleCIJob( "processors", docker_image=[{"image": "huggingface/transformers-torch-light"}], parallelism=8, - pytest_num_workers=6 ) tf_job = CircleCIJob( "tf", docker_image=[{"image":"huggingface/transformers-tf-light"}], parallelism=6, - pytest_num_workers=16, ) @@ -235,7 +231,8 @@ def job_name(self): "flax", docker_image=[{"image":"huggingface/transformers-jax-light"}], parallelism=6, - pytest_num_workers=16 + pytest_num_workers=16, + resource_class="2xlarge", ) @@ -244,7 +241,7 @@ def job_name(self): additional_env={"RUN_PIPELINE_TESTS": True}, docker_image=[{"image":"huggingface/transformers-torch-light"}], marker="is_pipeline_test", - parallelism=4 + parallelism=4, ) @@ -253,7 +250,7 @@ def job_name(self): additional_env={"RUN_PIPELINE_TESTS": True}, docker_image=[{"image":"huggingface/transformers-tf-light"}], marker="is_pipeline_test", - parallelism=4 + parallelism=4, ) @@ -270,7 +267,6 @@ def job_name(self): docker_image=[{"image":"huggingface/transformers-examples-torch"}], # TODO @ArthurZucker remove this once docker is easier to build install_steps=["uv venv && uv pip install . 
&& uv pip install -r examples/pytorch/_tests_requirements.txt"], - pytest_num_workers=8, ) @@ -278,7 +274,6 @@ def job_name(self): "examples_tensorflow", additional_env={"OMP_NUM_THREADS": 8}, docker_image=[{"image":"huggingface/transformers-examples-tf"}], - pytest_num_workers=16, ) @@ -293,6 +288,7 @@ def job_name(self): ], marker="is_staging_test", pytest_num_workers=2, + resource_class="medium", ) @@ -305,13 +301,13 @@ def job_name(self): ], pytest_options={"k onnx": None}, pytest_num_workers=1, + resource_class="small", ) exotic_models_job = CircleCIJob( "exotic_models", docker_image=[{"image":"huggingface/transformers-exotic-models"}], - pytest_num_workers=12, parallelism=4, pytest_options={"durations": 100}, ) @@ -330,7 +326,6 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="not generate", parallelism=6, - pytest_num_workers=8, ) diff --git a/.github/workflows/self-nightly-past-ci-caller.yml b/.github/workflows/self-nightly-past-ci-caller.yml index 142399a6366ce6..46d811d4a43394 100644 --- a/.github/workflows/self-nightly-past-ci-caller.yml +++ b/.github/workflows/self-nightly-past-ci-caller.yml @@ -21,39 +21,6 @@ jobs: echo "$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" echo "run_number=$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" >> $GITHUB_OUTPUT - run_past_ci_pytorch_1-13: - name: PyTorch 1.13 - needs: get_number - if: needs.get_number.outputs.run_number == 0 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - uses: ./.github/workflows/self-past-caller.yml - with: - framework: pytorch - version: "1.13" - sha: ${{ github.sha }} - secrets: inherit - - run_past_ci_pytorch_1-12: - name: PyTorch 1.12 - needs: get_number - if: needs.get_number.outputs.run_number == 1 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - uses: ./.github/workflows/self-past-caller.yml - with: - framework: pytorch - version: "1.12" - sha: ${{ github.sha }} - secrets: inherit - - run_past_ci_pytorch_1-11: - name: PyTorch 1.11 - needs: get_number - if: needs.get_number.outputs.run_number == 2 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - uses: ./.github/workflows/self-past-caller.yml - with: - framework: pytorch - version: "1.11" - sha: ${{ github.sha }} - secrets: inherit - run_past_ci_tensorflow_2-11: name: TensorFlow 2.11 needs: get_number diff --git a/README.md b/README.md index c748e675066202..42403f84b885da 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta ### With pip -This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+. +This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 2.0+, and TensorFlow 2.6+. You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). 
diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 089be4a4460101..3cb2acdc53bb1a 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -50,6 +50,9 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef # Add aqlm for quantization testing RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 +# Add vptq for quantization testing +RUN python3 -m pip install --no-cache-dir vptq + # Add hqq for quantization testing RUN python3 -m pip install --no-cache-dir hqq diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index 138d3a1bd8aa08..287f4dffbb384e 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -157,6 +157,8 @@ # title: AWQ # - local: quantization/aqlm # title: AQLM +# - local: quantization/vptq +# title: VPTQ # - local: quantization/quanto # title: Quanto # - local: quantization/eetq diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 8138dd41d80c12..18de03e1df8016 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -167,6 +167,8 @@ title: AWQ - local: quantization/aqlm title: AQLM + - local: quantization/vptq + title: VPTQ - local: quantization/quanto title: Quanto - local: quantization/eetq diff --git a/docs/source/en/add_new_pipeline.md b/docs/source/en/add_new_pipeline.md index e646f832831504..e8234c565b26c8 100644 --- a/docs/source/en/add_new_pipeline.md +++ b/docs/source/en/add_new_pipeline.md @@ -184,7 +184,7 @@ class PairClassificationPipeline(Pipeline): ``` The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in -a file named `pair_classification.py`, we can then import it and register it like this. The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5fb0164e89d4998e5776897c16f7330d3df/src/transformers/pipelines/base.py#L1387) function registers the pipeline details (task type, pipeline class, supported backends) to a models `config.json` file. +a file named `pair_classification.py`, we can then import it and register it like this. ```py from pair_classification import PairClassificationPipeline @@ -199,6 +199,22 @@ PIPELINE_REGISTRY.register_pipeline( ) ``` +The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5fb0164e89d4998e5776897c16f7330d3df/src/transformers/pipelines/base.py#L1387) function registers the pipeline details (task type, pipeline class, supported backends) to a model's `config.json` file. + +```json + "custom_pipelines": { + "pair-classification": { + "impl": "pair_classification.PairClassificationPipeline", + "pt": [ + "AutoModelForSequenceClassification" + ], + "tf": [ + "TFAutoModelForSequenceClassification" + ] + } + }, +``` + Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not.
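As a quick, illustrative sketch of how the registered task could then be used (a minimal example under assumptions: the checkpoint is the `sgugger/finetuned-bert-mrpc` model mentioned above, and the `second_text` keyword is assumed to match the `preprocess` signature of the `PairClassificationPipeline` defined earlier in this guide):

```py
from transformers import pipeline

# "pair-classification" resolves because it was registered above with PIPELINE_REGISTRY.register_pipeline.
classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc")

# The second sentence is forwarded to the custom preprocess step; the keyword name
# (`second_text`) is an assumption based on the pipeline class shown earlier.
print(classifier("I like you", second_text="I love you"))
```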
diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index a54ac432006a84..d8931342ee45f8 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -352,6 +352,8 @@ A [`Constraint`] can be used to force the generation to include specific tokens [[autodoc]] TextIteratorStreamer +[[autodoc]] AsyncTextIteratorStreamer + ## Caches [[autodoc]] Cache diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index e97ace8a625050..17ebb841de7a39 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -473,7 +473,7 @@ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable Quantization reduces the size of the LLM weights by storing them in a lower precision. This translates to lower memory usage and makes loading LLMs for inference more accessible if you're constrained by your GPUs memory. If you aren't limited by your GPU, you don't necessarily need to quantize your model because it can incur a small latency cost (except for AWQ and fused AWQ modules) due to the extra step required to quantize and dequantize the weights. > [!TIP] -> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes. +> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, VPTQ, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes. Use the Model Memory Calculator below to estimate and compare how much memory is required to load a model. For example, try estimating how much memory it costs to load [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1). diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md index 3f44569697777b..9b500b69374c88 100755 --- a/docs/source/en/main_classes/quantization.md +++ b/docs/source/en/main_classes/quantization.md @@ -34,6 +34,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide. [[autodoc]] AqlmConfig +## VptqConfig + +[[autodoc]] VptqConfig + ## AwqConfig [[autodoc]] AwqConfig diff --git a/docs/source/en/modular_transformers.md b/docs/source/en/modular_transformers.md index 1516233ec4d6e1..8eebbf347c11c3 100644 --- a/docs/source/en/modular_transformers.md +++ b/docs/source/en/modular_transformers.md @@ -22,6 +22,9 @@ etc. Model contribution PRs rarely add less than 3-5k lines of code, with much o This raises the bar for contributions, and with Modular Transformers, we're aiming to lower the bar to a much more acceptable point. +If you plan to add a model to `transformers` make sure you read [How to add a model to 🤗 Transformers?](https://huggingface.co/docs/transformers/add_new_model). +For any kind of contributions, see [CONTRIBUTING.md](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md). + ## What is it? 
Modular Transformers introduces the concept of a "modular" file to a model folder. This modular file accepts code @@ -43,6 +46,12 @@ be moved to the new Modular Transformers format in the coming months. ### Details +To generate a single file from the modular file, run the following command. + +```bash +python utils/modular_model_converter.py --files-to-parse src/transformers/models/<your_model>/modular_<your_model>.py +``` + The "linter", which unravels the inheritance and creates all single-files from the modular file, will flatten the inheritance while trying to be invisible to Python users. At this time, the linter flattens a **single** level of inheritance. @@ -59,7 +68,11 @@ file, and the corresponding files will be created for you. ### Enforcement -[TODO] We are introducing a new test, that makes sure the generated content matches what is present in the `modular_xxxx.py` +Run the command below to ensure that the generated content matches `modular_<your_model>.py`. + +```bash +python utils/check_modular_conversion.py --files src/transformers/models/<your_model>/modular_<your_model>.py +``` ### Examples @@ -194,4 +207,4 @@ We now also support special cases like class GemmaVisionModel(CLIPModel): pass ``` -where the name of your class `GemmaVision` is not the same as the modular `Gemma`. This is super useful for composite models. \ No newline at end of file +where the name of your class `GemmaVision` is not the same as the modular `Gemma`. This is super useful for composite models. diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 0fb72d26058e55..f3508aed0674f6 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -58,6 +58,7 @@ Use the table below to help you decide which quantization method to use. | [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto | | [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | | [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | +| [VPTQ](./vptq) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ | @@ -71,4 +72,4 @@ We value your feedback to help identify bugs before the full release! Check out \** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships. - + \ No newline at end of file diff --git a/docs/source/en/quantization/vptq.md b/docs/source/en/quantization/vptq.md new file mode 100644 index 00000000000000..b86e82f0a3503d --- /dev/null +++ b/docs/source/en/quantization/vptq.md @@ -0,0 +1,111 @@ + + +# VPTQ + +> [!TIP] +> Try VPTQ on [Hugging Face](https://huggingface.co/spaces/microsoft/VPTQ)! +> Try VPTQ on [Google Colab](https://colab.research.google.com/github/microsoft/VPTQ/blob/main/notebooks/vptq_example.ipynb)! +> Know more about VPTQ on [ArXiv](https://arxiv.org/pdf/2409.17066)! + +Vector Post-Training Quantization ([VPTQ](https://github.com/microsoft/VPTQ)) is a novel Post-Training Quantization method that leverages Vector Quantization to achieve high accuracy on LLMs at an extremely low bit-width (<2-bit). VPTQ can compress a 70B, or even a 405B, model to 1-2 bits without retraining while maintaining high accuracy.
+ +- Better Accuracy on 1-2 bits (405B @ <2-bit, 70B @ 2-bit) +- Lightweight Quantization Algorithm: it only costs ~17 hours to quantize the 405B Llama-3.1 +- Agile Quantization Inference: low decode overhead, best throughput, and TTFT + +Inference support for VPTQ is released in the `vptq` library. Make sure to install it to run the models: +```bash +pip install vptq +``` + +The library provides efficient kernels for NVIDIA/AMD GPU inference. + +To run VPTQ models, simply load a model that has been quantized with VPTQ: + +## Inference example +**Run Llama 3.1 70b on RTX4090 (24G @ ~2bits) in real time** +![Llama3 1-70b-prompt](https://github.com/user-attachments/assets/d8729aca-4e1d-4fe1-ac71-c14da4bdd97f) + + +```python +from transformers import AutoTokenizer, AutoModelForCausalLM + +quantized_model = AutoModelForCausalLM.from_pretrained( + "VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft", + torch_dtype="auto", + device_map="auto" +) +tokenizer = AutoTokenizer.from_pretrained("VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft") +input_ids = tokenizer("hello, it's me", return_tensors="pt").to("cuda") +out = quantized_model.generate(**input_ids, max_new_tokens=32, do_sample=False) +``` + +## Quantize your own model +An early release of the VPTQ algorithm is available at [VPTQ](https://github.com/microsoft/VPTQ/tree/algorithm); +check out the [tutorial](https://github.com/microsoft/VPTQ/blob/algorithm/algorithm.md). + +## Early Results from Tech Report +VPTQ achieves better accuracy and higher throughput with lower quantization overhead across models of different sizes. The following experimental results are for reference only; VPTQ can achieve better outcomes under reasonable parameters, especially in terms of model accuracy and inference speed. + + +| Model | bitwidth | W2↓ | C4↓ | AvgQA↑ | tok/s↑ | mem(GB) | cost/h↓ | | ----------- | -------- | ---- | ---- | ------ | ------ | ------- | ------- | | LLaMA-2 7B | 2.02 | 6.13 | 8.07 | 58.2 | 39.9 | 2.28 | 2 | | | 2.26 | 5.95 | 7.87 | 59.4 | 35.7 | 2.48 | 3.1 | | LLaMA-2 13B | 2.02 | 5.32 | 7.15 | 62.4 | 26.9 | 4.03 | 3.2 | | | 2.18 | 5.28 | 7.04 | 63.1 | 18.5 | 4.31 | 3.6 | | LLaMA-2 70B | 2.07 | 3.93 | 5.72 | 68.6 | 9.7 | 19.54 | 19 | | | 2.11 | 3.92 | 5.71 | 68.7 | 9.7 | 20.01 | 19 | + + +## More Models in [VPTQ-community](https://huggingface.co/VPTQ-community) + +⚠️ The repository only provides the model quantization algorithm. + +⚠️ The open-source community VPTQ-community provides models based on the technical report and quantization algorithm. + + + +**Quick Estimation of Model Bitwidth (Excluding Codebook Overhead)**: + +- **Model Naming Convention**: The model's name includes the **vector length** $v$, **codebook (lookup table) size**, and **residual codebook size**. For example, "Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft" is a quantized "Meta-Llama-3.1-70B-Instruct", where: + - **Vector Length**: 8 + - **Number of Centroids**: 65536 (2^16) + - **Number of Residual Centroids**: 256 (2^8) +- **Equivalent Bitwidth Calculation**: + - **Index**: log2(65536) = 16 bits, and 16 / 8 (vector length) = 2 bits per weight + - **Residual Index**: log2(256) = 8 bits, and 8 / 8 (vector length) = 1 bit per weight + - **Total Bitwidth**: 2 + 1 = 3 bits per weight +- **Model Size Estimation**: 70B * 3 bits / 8 bits per Byte = 26.25 GB + +- **Note**: This estimate does not include the size of the codebook (lookup table), other parameter overheads, and the padding overhead for storing indices. For the detailed calculation method, please refer to **Tech Report Appendix C.2**. A worked example of this quick estimate follows below.
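The quick estimate above can be reproduced in a few lines of Python. This is only a sketch of the arithmetic described by the naming convention (the helper names are illustrative, and codebook, parameter, and padding overheads are ignored, as noted):

```python
import math


def estimate_bits_per_weight(vector_length: int, num_centroids: int, num_res_centroids: int = 0) -> float:
    """Equivalent bitwidth: log2(centroids) / vector length, plus the residual term if present."""
    bits = math.log2(num_centroids) / vector_length
    if num_res_centroids > 0:
        bits += math.log2(num_res_centroids) / vector_length
    return bits


def estimate_model_size_gb(num_parameters: float, bits_per_weight: float) -> float:
    """Rough storage estimate: parameters * bits per weight / 8 bits per byte, in GB."""
    return num_parameters * bits_per_weight / 8 / 1e9


# "Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft": v=8, 65536 centroids, 256 residual centroids
bits = estimate_bits_per_weight(vector_length=8, num_centroids=65536, num_res_centroids=256)
print(bits)                                # 2 + 1 = 3 bits per weight
print(estimate_model_size_gb(70e9, bits))  # ~26.25 GB, matching the estimate above
```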
+ + +| Model Series | Collections | (Estimated) Bit per weight | +| :--------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------: | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Llama 3.1 Nemotron 70B Instruct HF | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-nemotron-70b-instruct-hf-without-finetune-671730b96f16208d0b3fe942) | [4 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-0-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-16384-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-1024-woft) [1.5 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-256-woft) | +| Llama 3.1 8B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-8b-instruct-without-finetune-66f2b70b1d002ceedef02d2e) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-65536-woft) [3.5 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-4096-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-256-woft) [2.3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft) | +| Llama 3.1 70B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-70b-instruct-without-finetune-66f2bf454d3dd78dfee2ff11) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft) [2.25 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-4-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-0-woft) [1.93 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-32768-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k32768-0-woft) [1.75 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k16384-0-woft) | +| Llama 3.1 405B Instruct | [HF 
🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-405b-instruct-without-finetune-66f4413f9ba55e1a9e52cfb0) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-256-woft) [2 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-65536-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k32768-32768-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-1024-woft) [1.5 bits (1)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k4096-0-woft) [1.5 bits (2)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-256-woft) [1.43 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-128-woft) [1.375 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-64-woft) | +| Mistral Large Instruct 2407 (123B) | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-mistral-large-instruct-2407-without-finetune-6711ebfb7faf85eed9cceb16) | [4 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-0-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-16384-woft) [1.75 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-4096-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-1024-woft) [1.5 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-256-woft) | +| Qwen 2.5 7B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-7b-instruct-without-finetune-66f3e9866d3167cc05ce954a) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k256-256-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v16-k65536-65536-woft) | +| Qwen 2.5 14B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-14b-instruct-without-finetune-66f827f83c7ffa7931b8376c) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k256-256-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v16-k65536-65536-woft) | +| Qwen 2.5 32B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-32b-instruct-without-finetune-66fe77173bf7d64139f0f613) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-256-woft) [2 bits 
(1)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k256-256-woft) | +| Qwen 2.5 72B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-72b-instruct-without-finetune-66f3bf1b3757dfa1ecb481c0) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-256-woft) [2.38 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k1024-512-woft) [2.25 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k512-512-woft) [2.25 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-4-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-0-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-65536-woft) [1.94 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-32768-woft) | +| Reproduced from the tech report | [HF 🤗](https://huggingface.co/collections/VPTQ-community/reproduced-vptq-tech-report-baseline-66fbf1dffe741cc9e93ecf04) | Results from the open source community for reference only, please use them responsibly. | +| Hessian and Inverse Hessian Matrix | [HF 🤗](https://huggingface.co/collections/VPTQ-community/hessian-and-invhessian-checkpoints-66fd249a104850d17b23fd8b) | Collected from RedPajama-Data-1T-Sample, following [Quip#](https://github.com/Cornell-RelaxML/quip-sharp/blob/main/quantize_llama/hessian_offline_llama.py) \ No newline at end of file diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md index 138fed6a1c0d1d..973f95e1e9555d 100644 --- a/docs/source/en/tasks/audio_classification.md +++ b/docs/source/en/tasks/audio_classification.md @@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. --> @@ -20,7 +20,7 @@ rendered properly in your Markdown viewer. -Audio classification - just like with text - assigns a class label output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds. +Audio classification - just like with text - assigns a class label as output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds. 
This guide will show you how to: @@ -57,7 +57,7 @@ Start by loading the MInDS-14 dataset from the 🤗 Datasets library: >>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train") ``` -Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This'll give you a chance to experiment and make sure everything works before spending more time on the full dataset. +Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This will give you a chance to experiment and make sure everything works before spending more time on the full dataset. ```py >>> minds = minds.train_test_split(test_size=0.2) @@ -79,13 +79,13 @@ DatasetDict({ }) ``` -While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: +While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you will focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: ```py >>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"]) ``` -Take a look at an example now: +Here's an example: ```py >>> minds["train"][0] @@ -155,7 +155,7 @@ Now create a preprocessing function that: ... return inputs ``` -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that's the name the model expects: +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove unnecessary columns and rename `intent_class` to `label`, as required by the model: ```py >>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True) @@ -260,7 +260,7 @@ For a more in-depth example of how to fine-tune a model for audio classification Great, now that you've fine-tuned a model, you can use it for inference! -Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! +Load an audio file for inference. Remember to resample the sampling rate of the audio file to match the model's sampling rate, if necessary. 
```py >>> from datasets import load_dataset, Audio diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 7e9567769cca1a..54740610ee1148 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -151,6 +151,8 @@ title: AWQ - local: in_translation title: (번역중) AQLM + - local: in_translation + title: (번역중) VPTQ - local: in_translation title: (번역중) Quanto - local: in_translation @@ -173,6 +175,8 @@ title: (번역중) AWQ - local: in_translation title: (번역중) AQLM + - local: in_translation + title: (번역중) VPTQ - local: quantization/quanto title: Quanto - local: quantization/eetq diff --git a/docs/source/ko/llm_optims.md b/docs/source/ko/llm_optims.md index 656ed53584c226..99eabc19ce860a 100644 --- a/docs/source/ko/llm_optims.md +++ b/docs/source/ko/llm_optims.md @@ -375,7 +375,7 @@ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable 양자화는 LLM 가중치를 더 낮은 정밀도로 저장하여 크기를 줄입니다. 이는 메모리 사용량을 줄이며 GPU 메모리에 제약이 있는 경우 추론을 위해 LLM을 로드하는 것을 더 용이하게 합니다. GPU가 충분하다면, 모델을 양자화할 필요는 없습니다. 추가적인 양자화 및 양자화 해제 단계로 인해 약간의 지연이 발생할 수 있기 때문입니다(AWQ 및 융합 AWQ 모듈 제외). > [!TIP] -> 다양한 양자화 라이브러리(자세한 내용은 [Quantization](./quantization) 가이드를 참조하십시오)가 있습니다. 여기에는 Quanto, AQLM, AWQ 및 AutoGPTQ가 포함됩니다. 사용 사례에 가장 잘 맞는 라이브러리를 사용해 보십시오. 또한 AutoGPTQ와 bitsandbytes를 비교하는 [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) 블로그 게시물을 읽어보는 것을 추천합니다. +> 다양한 양자화 라이브러리(자세한 내용은 [Quantization](./quantization) 가이드를 참조하십시오)가 있습니다. 여기에는 Quanto, AQLM, VPTQ, AWQ 및 AutoGPTQ가 포함됩니다. 사용 사례에 가장 잘 맞는 라이브러리를 사용해 보십시오. 또한 AutoGPTQ와 bitsandbytes를 비교하는 [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) 블로그 게시물을 읽어보는 것을 추천합니다. 아래의 모델 메모리 계산기를 사용하여 모델을 로드하는 데 필요한 메모리를 추정하고 비교해 보십시오. 예를 들어 [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)를 로드하는 데 필요한 메모리를 추정해 보십시오. diff --git a/docs/source/ko/main_classes/quantization.md b/docs/source/ko/main_classes/quantization.md index b1d1730d28d00b..6f793f22107417 100644 --- a/docs/source/ko/main_classes/quantization.md +++ b/docs/source/ko/main_classes/quantization.md @@ -35,6 +35,10 @@ Transformers에서 지원되지 않는 양자화 기법들은 [`HfQuantizer`] [[autodoc]] AqlmConfig +## VptqConfig[[transformers.VptqConfig]] + +[[autodoc]] VptqConfig + ## AwqConfig[[transformers.AwqConfig]] [[autodoc]] AwqConfig diff --git a/i18n/README_ar.md b/i18n/README_ar.md index 8160ec908d4411..c7249ac23d2e7f 100644 --- a/i18n/README_ar.md +++ b/i18n/README_ar.md @@ -245,7 +245,7 @@ limitations under the License. ### باستخدام pip -تم اختبار هذا المستودع على Python 3.9+، Flax 0.4.1+، PyTorch 1.11+، و TensorFlow 2.6+. +تم اختبار هذا المستودع على Python 3.9+، Flax 0.4.1+، PyTorch 2.0+، و TensorFlow 2.6+. يجب تثبيت 🤗 Transformers في [بيئة افتراضية](https://docs.python.org/3/library/venv.html). إذا كنت غير معتاد على البيئات الافتراضية Python، فراجع [دليل المستخدم](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_de.md b/i18n/README_de.md index ccc9e6111a25f0..78447af41a7a82 100644 --- a/i18n/README_de.md +++ b/i18n/README_de.md @@ -246,7 +246,7 @@ Das Modell selbst ist ein reguläres [PyTorch `nn.Module`](https://pytorch.org/d ### Mit pip -Dieses Repository wurde mit Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ und TensorFlow 2.6+ getestet. +Dieses Repository wurde mit Python 3.9+, Flax 0.4.1+, PyTorch 2.0+ und TensorFlow 2.6+ getestet. 
Sie sollten 🤗 Transformers in einer [virtuellen Umgebung](https://docs.python.org/3/library/venv.html) installieren. Wenn Sie mit virtuellen Python-Umgebungen nicht vertraut sind, schauen Sie sich den [Benutzerleitfaden](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) an. diff --git a/i18n/README_es.md b/i18n/README_es.md index 5d5ba1b3249785..57eb8117fc0d5d 100644 --- a/i18n/README_es.md +++ b/i18n/README_es.md @@ -222,7 +222,7 @@ El modelo en si es un [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.h ### Con pip -Este repositorio está probado en Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ y TensorFlow 2.6+. +Este repositorio está probado en Python 3.9+, Flax 0.4.1+, PyTorch 2.0+ y TensorFlow 2.6+. Deberías instalar 🤗 Transformers en un [entorno virtual](https://docs.python.org/3/library/venv.html). Si no estas familiarizado con los entornos virtuales de Python, consulta la [guía de usuario](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_fr.md b/i18n/README_fr.md index 97b11166b301a1..02714d52bff39b 100644 --- a/i18n/README_fr.md +++ b/i18n/README_fr.md @@ -243,7 +243,7 @@ Le modèle lui-même est un module [`nn.Module` PyTorch](https://pytorch.org/doc ### Avec pip -Ce référentiel est testé sur Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ et TensorFlow 2.6+. +Ce référentiel est testé sur Python 3.9+, Flax 0.4.1+, PyTorch 2.0+ et TensorFlow 2.6+. Vous devriez installer 🤗 Transformers dans un [environnement virtuel](https://docs.python.org/3/library/venv.html). Si vous n'êtes pas familier avec les environnements virtuels Python, consultez le [guide utilisateur](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_hd.md b/i18n/README_hd.md index 17efdd21eb04dc..1541e4df66fcbd 100644 --- a/i18n/README_hd.md +++ b/i18n/README_hd.md @@ -198,7 +198,7 @@ checkpoint: जाँच बिंदु ### पिप का उपयोग करना -इस रिपॉजिटरी का परीक्षण Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ और TensorFlow 2.6+ के तहत किया गया है। +इस रिपॉजिटरी का परीक्षण Python 3.9+, Flax 0.4.1+, PyTorch 2.0+ और TensorFlow 2.6+ के तहत किया गया है। आप [वर्चुअल एनवायरनमेंट](https://docs.python.org/3/library/venv.html) में 🤗 ट्रांसफॉर्मर इंस्टॉल कर सकते हैं। यदि आप अभी तक पायथन के वर्चुअल एनवायरनमेंट से परिचित नहीं हैं, तो कृपया इसे [उपयोगकर्ता निर्देश](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) पढ़ें। diff --git a/i18n/README_ja.md b/i18n/README_ja.md index 3d417098ea314d..fc3d4ae945cefd 100644 --- a/i18n/README_ja.md +++ b/i18n/README_ja.md @@ -256,7 +256,7 @@ Hugging Faceチームによって作られた **[トランスフォーマーを ### pipにて -このリポジトリは、Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, TensorFlow 2.6+ でテストされています。 +このリポジトリは、Python 3.9+, Flax 0.4.1+, PyTorch 2.0+, TensorFlow 2.6+ でテストされています。 🤗Transformersは[仮想環境](https://docs.python.org/3/library/venv.html)にインストールする必要があります。Pythonの仮想環境に慣れていない場合は、[ユーザーガイド](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)を確認してください。 diff --git a/i18n/README_ko.md b/i18n/README_ko.md index b9502db5dda845..6d6559398e4d17 100644 --- a/i18n/README_ko.md +++ b/i18n/README_ko.md @@ -242,7 +242,7 @@ Transformers에 달린 100,000개의 별을 축하하기 위해, 우리는 커 ### pip로 설치하기 -이 저장소는 Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, TensorFlow 2.6+에서 테스트 되었습니다. +이 저장소는 Python 3.9+, Flax 0.4.1+, PyTorch 2.0+, TensorFlow 2.6+에서 테스트 되었습니다. [가상 환경](https://docs.python.org/3/library/venv.html)에 🤗 Transformers를 설치하세요. 
Python 가상 환경에 익숙하지 않다면, [사용자 가이드](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)를 확인하세요. diff --git a/i18n/README_pt-br.md b/i18n/README_pt-br.md index d9248f9a151c36..f865f1b6ed9ca5 100644 --- a/i18n/README_pt-br.md +++ b/i18n/README_pt-br.md @@ -253,7 +253,7 @@ O modelo em si é um [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.ht ### Com pip -Este repositório é testado no Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ e TensorFlow 2.6+. +Este repositório é testado no Python 3.9+, Flax 0.4.1+, PyTorch 2.0+ e TensorFlow 2.6+. Você deve instalar o 🤗 Transformers em um [ambiente virtual](https://docs.python.org/3/library/venv.html). Se você não está familiarizado com ambientes virtuais em Python, confira o [guia do usuário](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_ru.md b/i18n/README_ru.md index a359b52d2ccc73..c153474f339000 100644 --- a/i18n/README_ru.md +++ b/i18n/README_ru.md @@ -244,7 +244,7 @@ Hugging Face Hub. Мы хотим, чтобы Transformers позволил ра ### С помощью pip -Данный репозиторий протестирован на Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ и TensorFlow 2.6+. +Данный репозиторий протестирован на Python 3.9+, Flax 0.4.1+, PyTorch 2.0+ и TensorFlow 2.6+. Устанавливать 🤗 Transformers следует в [виртуальной среде](https://docs.python.org/3/library/venv.html). Если вы не знакомы с виртуальными средами Python, ознакомьтесь с [руководством пользователя](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_te.md b/i18n/README_te.md index a9795e9ca326aa..791ed6414f73d2 100644 --- a/i18n/README_te.md +++ b/i18n/README_te.md @@ -246,7 +246,7 @@ limitations under the License. ### పిప్ తో -ఈ రిపోజిటరీ పైథాన్ 3.9+, ఫ్లాక్స్ 0.4.1+, PyTorch 1.11+ మరియు TensorFlow 2.6+లో పరీక్షించబడింది. +ఈ రిపోజిటరీ పైథాన్ 3.9+, ఫ్లాక్స్ 0.4.1+, PyTorch 2.0+ మరియు TensorFlow 2.6+లో పరీక్షించబడింది. మీరు [వర్చువల్ వాతావరణం](https://docs.python.org/3/library/venv.html)లో 🤗 ట్రాన్స్‌ఫార్మర్‌లను ఇన్‌స్టాల్ చేయాలి. మీకు పైథాన్ వర్చువల్ పరిసరాల గురించి తెలియకుంటే, [యూజర్ గైడ్](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) చూడండి. diff --git a/i18n/README_ur.md b/i18n/README_ur.md index cc37b5cfc4223d..2d4d7745f68eaf 100644 --- a/i18n/README_ur.md +++ b/i18n/README_ur.md @@ -259,7 +259,7 @@ limitations under the License. #### ‏ pip کے ساتھ -یہ ریپوزٹری Python 3.9+، Flax 0.4.1+، PyTorch 1.11+، اور TensorFlow 2.6+ پر ٹیسٹ کی گئی ہے۔ +یہ ریپوزٹری Python 3.9+، Flax 0.4.1+، PyTorch 2.0+، اور TensorFlow 2.6+ پر ٹیسٹ کی گئی ہے۔ آپ کو 🤗 Transformers کو ایک [ورچوئل ماحول](https://docs.python.org/3/library/venv.html) میں انسٹال کرنا چاہیے۔ اگر آپ Python ورچوئل ماحول سے واقف نہیں ہیں، تو [یوزر گائیڈ](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) دیکھیں۔ diff --git a/i18n/README_vi.md b/i18n/README_vi.md index f523c282b680c4..4f7f67bfce90ff 100644 --- a/i18n/README_vi.md +++ b/i18n/README_vi.md @@ -245,7 +245,7 @@ Chính mô hình là một [Pytorch `nn.Module`](https://pytorch.org/docs/stable ### Sử dụng pip -Thư viện này được kiểm tra trên Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ và TensorFlow 2.6+. +Thư viện này được kiểm tra trên Python 3.9+, Flax 0.4.1+, PyTorch 2.0+ và TensorFlow 2.6+. Bạn nên cài đặt 🤗 Transformers trong một [môi trường ảo Python](https://docs.python.org/3/library/venv.html). 
Nếu bạn chưa quen với môi trường ảo Python, hãy xem [hướng dẫn sử dụng](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_zh-hans.md b/i18n/README_zh-hans.md index c9ac0357f18f1b..b4d121df0d3200 100644 --- a/i18n/README_zh-hans.md +++ b/i18n/README_zh-hans.md @@ -198,7 +198,7 @@ checkpoint: 检查点 ### 使用 pip -这个仓库已在 Python 3.9+、Flax 0.4.1+、PyTorch 1.11+ 和 TensorFlow 2.6+ 下经过测试。 +这个仓库已在 Python 3.9+、Flax 0.4.1+、PyTorch 2.0+ 和 TensorFlow 2.6+ 下经过测试。 你可以在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装 🤗 Transformers。如果你还不熟悉 Python 的虚拟环境,请阅此[用户说明](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。 diff --git a/i18n/README_zh-hant.md b/i18n/README_zh-hant.md index 87c623ee84a61b..dcafd4958ed1d1 100644 --- a/i18n/README_zh-hant.md +++ b/i18n/README_zh-hant.md @@ -210,7 +210,7 @@ Tokenizer 為所有的預訓練模型提供了預處理,並可以直接轉換 ### 使用 pip -這個 Repository 已在 Python 3.9+、Flax 0.4.1+、PyTorch 1.11+ 和 TensorFlow 2.6+ 下經過測試。 +這個 Repository 已在 Python 3.9+、Flax 0.4.1+、PyTorch 2.0+ 和 TensorFlow 2.6+ 下經過測試。 你可以在[虛擬環境](https://docs.python.org/3/library/venv.html)中安裝 🤗 Transformers。如果你還不熟悉 Python 的虛擬環境,請閱此[使用者指引](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。 diff --git a/setup.py b/setup.py index c2c0048d6913ec..a78bb20dd0a4b0 100644 --- a/setup.py +++ b/setup.py @@ -100,7 +100,7 @@ "av==9.2.0", # Latest version of PyAV (10.0.0) has issues with audio stream. "beautifulsoup4", "blobfile", - "codecarbon==1.2.0", + "codecarbon>=2.8.1", "cookiecutter==1.7.3", "dataclasses", "datasets!=2.5.0", @@ -148,6 +148,7 @@ "pyyaml>=5.1", "pydantic", "pytest>=7.2.0,<8.0.0", + "pytest-asyncio", "pytest-timeout", "pytest-xdist", "python>=3.9.0", @@ -319,6 +320,7 @@ def run(self): extras["testing"] = ( deps_list( "pytest", + "pytest-asyncio", "pytest-rich", "pytest-xdist", "timeout-decorator", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 600d3d217fa8a9..5510ac6c8ad512 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -122,6 +122,7 @@ "feature_extraction_utils": ["BatchFeature", "FeatureExtractionMixin"], "file_utils": [], "generation": [ + "AsyncTextIteratorStreamer", "CompileConfig", "GenerationConfig", "TextIteratorStreamer", @@ -1000,6 +1001,7 @@ "HqqConfig", "QuantoConfig", "TorchAoConfig", + "VptqConfig", ], } @@ -5054,7 +5056,14 @@ from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin # Generation - from .generation import CompileConfig, GenerationConfig, TextIteratorStreamer, TextStreamer, WatermarkingConfig + from .generation import ( + AsyncTextIteratorStreamer, + CompileConfig, + GenerationConfig, + TextIteratorStreamer, + TextStreamer, + WatermarkingConfig, + ) from .hf_argparser import HfArgumentParser # Integrations @@ -6017,6 +6026,7 @@ HqqConfig, QuantoConfig, TorchAoConfig, + VptqConfig, ) try: diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py index 3875879f0e056d..c3431ad5b2e0ac 100755 --- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -106,7 +106,6 @@ XLMWithLMHeadModel, XLNetLMHeadModel, ) - from .pytorch_utils import is_torch_greater_or_equal_than_1_13 logging.set_verbosity_info() @@ -279,7 +278,7 @@ def convert_pt_checkpoint_to_tf( if compare_with_pt_model: tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network - weights_only_kwarg = 
{"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {} + weights_only_kwarg = {"weights_only": True} state_dict = torch.load( pytorch_checkpoint_path, map_location="cpu", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 85345cc8e5889d..6a737b805a456c 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -7,7 +7,7 @@ "av": "av==9.2.0", "beautifulsoup4": "beautifulsoup4", "blobfile": "blobfile", - "codecarbon": "codecarbon==1.2.0", + "codecarbon": "codecarbon>=2.8.1", "cookiecutter": "cookiecutter==1.7.3", "dataclasses": "dataclasses", "datasets": "datasets!=2.5.0", @@ -54,6 +54,7 @@ "pyyaml": "pyyaml>=5.1", "pydantic": "pydantic", "pytest": "pytest>=7.2.0,<8.0.0", + "pytest-asyncio": "pytest-asyncio", "pytest-timeout": "pytest-timeout", "pytest-xdist": "pytest-xdist", "python": "python>=3.9.0", diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py index 59d970db15416f..d3eb10c1e6b355 100644 --- a/src/transformers/generation/__init__.py +++ b/src/transformers/generation/__init__.py @@ -26,7 +26,7 @@ "SynthIDTextWatermarkingConfig", "WatermarkingConfig", ], - "streamers": ["TextIteratorStreamer", "TextStreamer"], + "streamers": ["AsyncTextIteratorStreamer", "TextIteratorStreamer", "TextStreamer"], } try: @@ -199,7 +199,7 @@ SynthIDTextWatermarkingConfig, WatermarkingConfig, ) - from .streamers import TextIteratorStreamer, TextStreamer + from .streamers import AsyncTextIteratorStreamer, TextIteratorStreamer, TextStreamer try: if not is_torch_available(): diff --git a/src/transformers/generation/streamers.py b/src/transformers/generation/streamers.py index c75b43466af7a8..c78e259db38be8 100644 --- a/src/transformers/generation/streamers.py +++ b/src/transformers/generation/streamers.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import asyncio from queue import Queue from typing import TYPE_CHECKING, Optional @@ -225,3 +226,91 @@ def __next__(self): raise StopIteration() else: return value + + +class AsyncTextIteratorStreamer(TextStreamer): + """ + Streamer that stores print-ready text in a queue, to be used by a downstream application as an async iterator. + This is useful for applications that benefit from acessing the generated text asynchronously (e.g. in an + interactive Gradio demo). + + + + The API for the streamer classes is still under development and may change in the future. + + + + Parameters: + tokenizer (`AutoTokenizer`): + The tokenized used to decode the tokens. + skip_prompt (`bool`, *optional*, defaults to `False`): + Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots. + timeout (`float`, *optional*): + The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions + in `.generate()`, when it is called in a separate thread. + decode_kwargs (`dict`, *optional*): + Additional keyword arguments to pass to the tokenizer's `decode` method. + + Raises: + TimeoutError: If token generation time exceeds timeout value. 
+ + Examples: + + ```python + >>> from transformers import AutoModelForCausalLM, AutoTokenizer, AsyncTextIteratorStreamer + >>> from threading import Thread + >>> import asyncio + + >>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") + >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pt") + + >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way. + >>> async def main(): + ... # Important: AsyncTextIteratorStreamer must be initialized inside a coroutine! + ... streamer = AsyncTextIteratorStreamer(tok) + ... generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=20) + ... thread = Thread(target=model.generate, kwargs=generation_kwargs) + ... thread.start() + ... generated_text = "" + ... async for new_text in streamer: + ... generated_text += new_text + >>> print(generated_text) + >>> asyncio.run(main()) + An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven, + ``` + """ + + def __init__( + self, tokenizer: "AutoTokenizer", skip_prompt: bool = False, timeout: float | None = None, **decode_kwargs + ): + super().__init__(tokenizer, skip_prompt, **decode_kwargs) + self.text_queue = asyncio.Queue() + self.stop_signal = None + self.timeout = timeout + self.loop = asyncio.get_running_loop() + self.has_asyncio_timeout = hasattr(asyncio, "timeout") + + def on_finalized_text(self, text: str, stream_end: bool = False): + """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue.""" + self.loop.call_soon_threadsafe(self.text_queue.put_nowait, text) + if stream_end: + self.loop.call_soon_threadsafe(self.text_queue.put_nowait, self.stop_signal) + + def __aiter__(self): + return self + + async def __anext__(self): + try: + if self.has_asyncio_timeout: + async with asyncio.timeout(self.timeout): + value = await self.text_queue.get() + else: + value = await asyncio.wait_for(self.text_queue.get(), timeout=self.timeout) + except asyncio.TimeoutError: + raise TimeoutError() + else: + if value == self.stop_signal: + raise StopAsyncIteration() + else: + return value diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 093e0af29844e4..32c828cd6e5b44 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -105,6 +105,7 @@ ], "peft": ["PeftAdapterMixin"], "quanto": ["replace_with_quanto_layers"], + "vptq": ["replace_with_vptq_linear"], } try: @@ -207,6 +208,7 @@ ) from .peft import PeftAdapterMixin from .quanto import replace_with_quanto_layers + from .vptq import replace_with_vptq_linear try: if not is_torch_available(): diff --git a/src/transformers/integrations/flash_attention.py b/src/transformers/integrations/flash_attention.py index 1be223f8b079ba..b8407bc29c6a8a 100644 --- a/src/transformers/integrations/flash_attention.py +++ b/src/transformers/integrations/flash_attention.py @@ -29,23 +29,34 @@ def flash_attention_forward( key = key.transpose(1, 2) value = value.transpose(1, 2) + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. 
(usually our RMSNorm modules handle it correctly) + target_dtype = None if query.dtype == torch.float32: - query = query.to(torch.float16) - key = key.to(torch.float16) - value = value.to(torch.float16) + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(module.config, "_pre_quantization_dtype"): + target_dtype = module.config._pre_quantization_dtype + else: + target_dtype = next(layer for layer in module.modules() if isinstance(layer, torch.nn.Linear)).weight.dtype attn_output = _flash_attention_forward( query, key, value, attention_mask, - seq_len, - module.is_causal, + query_length=seq_len, + is_causal=module.is_causal, dropout=dropout, softmax_scale=scaling, sliding_window=sliding_window, softcap=softcap, use_top_left_mask=_use_top_left_mask, + target_dtype=target_dtype, **kwargs, ) diff --git a/src/transformers/integrations/flex_attention.py b/src/transformers/integrations/flex_attention.py index eacfb2b568b55b..66ffc5638838cb 100644 --- a/src/transformers/integrations/flex_attention.py +++ b/src/transformers/integrations/flex_attention.py @@ -2,10 +2,10 @@ import torch -from ..utils import is_torch_greater_or_equal +from ..utils import is_torch_flex_attn_available -if is_torch_greater_or_equal("2.5"): +if is_torch_flex_attn_available(): from torch.nn.attention.flex_attention import flex_attention @@ -37,8 +37,12 @@ def causal_mod(score, b, h, q_idx, kv_idx): score_mod=causal_mod, enable_gqa=True, scale=scaling, + # Last time checked on PyTorch == 2.5.1: Flex Attention always computes the lse regardless. + # For simplification, we thus always return it as no additional computations are introduced. return_lse=True, ) + # lse is returned in float32 + attention_weights = attention_weights.to(value.dtype) attn_output = attn_output.transpose(1, 2).contiguous() return attn_output, attention_weights diff --git a/src/transformers/integrations/sdpa_attention.py b/src/transformers/integrations/sdpa_attention.py index 265260c9b79e4c..38701690bf7c2a 100644 --- a/src/transformers/integrations/sdpa_attention.py +++ b/src/transformers/integrations/sdpa_attention.py @@ -34,10 +34,14 @@ def sdpa_attention_forward( if attention_mask is not None: causal_mask = causal_mask[:, :, :, : key.shape[-2]] + # SDPA with memory-efficient backend is bugged with non-contiguous inputs and custom attn_mask for some torch versions + # Reference: https://github.com/pytorch/pytorch/issues/112577. query = query.contiguous() key = key.contiguous() value = value.contiguous() + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. if is_causal is None: is_causal = causal_mask is None and query.shape[2] > 1 diff --git a/src/transformers/integrations/vptq.py b/src/transformers/integrations/vptq.py new file mode 100644 index 00000000000000..aa435517e81ebe --- /dev/null +++ b/src/transformers/integrations/vptq.py @@ -0,0 +1,101 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"VPTQ (Vector Post-Training Quantization) integration file" + +import torch.nn as nn +from accelerate import init_empty_weights +from vptq import VQuantLinear + + +def replace_with_vptq_linear( + model, + quantization_config=None, + modules_to_not_convert=None, + current_key_name=None, + has_been_replaced=False, +): + """ + Public method that recursively replaces the Linear layers of the given model with VPTQ quantized layers. + `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the + conversion has been successfull or not. + + Args: + model (`torch.nn.Module`): + The model to convert, can be any `torch.nn.Module` instance. + quantization_config (`VptqConfig`): + The quantization config object that contains the quantization parameters. + modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`): + Names of the modules to not convert in `VQuantLinear`. In practice we keep the `lm_head` in full precision + for numerical stability reasons. + current_key_name (`list`, *optional*): + A list that contains the current key name. This is used for recursion and should not be passed by the user. + has_been_replaced (`bool`, *optional*): + A boolean that indicates if the conversion has been successful or not. This is used for recursion and + should not be passed by the user. + """ + + modules_to_not_convert = ["lm_head"] if not modules_to_not_convert else modules_to_not_convert + + for name, module in model.named_children(): + if current_key_name is None: + current_key_name = [] + current_key_name.append(name) + layer_name = ".".join(current_key_name) + shared_layer_config = quantization_config.shared_layer_config + config_for_layers = quantization_config.config_for_layers + + if ( + isinstance(module, nn.Linear) + and layer_name not in modules_to_not_convert + and ((layer_name in config_for_layers) or (current_key_name[-1] in shared_layer_config)) + ): + layer_params = config_for_layers.get(layer_name, None) or shared_layer_config.get( + current_key_name[-1], None + ) + + with init_empty_weights(): + in_features = module.in_features + out_features = module.out_features + + model._modules[name] = VQuantLinear( + in_features, + out_features, + vector_lens=layer_params["vector_lens"], + num_centroids=layer_params["num_centroids"], + num_res_centroids=layer_params["num_res_centroids"], + group_num=layer_params["group_num"], + group_size=layer_params["group_size"], + outlier_size=layer_params["outlier_size"], + indices_as_float=layer_params["indices_as_float"], + enable_norm=layer_params["enable_norm"], + enable_perm=layer_params["enable_perm"], + is_indice_packed=True, + enable_proxy_error=False, + bias=module.bias is not None, + ) + has_been_replaced = True + + # Force requires grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) + if len(list(module.children())) > 0: + _, has_been_replaced = replace_with_vptq_linear( + module, + quantization_config=quantization_config, + modules_to_not_convert=modules_to_not_convert, + current_key_name=current_key_name, + 
has_been_replaced=has_been_replaced, + ) + # Remove the last key for recursion + current_key_name.pop(-1) + return model, has_been_replaced diff --git a/src/transformers/modeling_attn_mask_utils.py b/src/transformers/modeling_attn_mask_utils.py index 4319c021cb2bc3..09fc77e46b07ed 100755 --- a/src/transformers/modeling_attn_mask_utils.py +++ b/src/transformers/modeling_attn_mask_utils.py @@ -169,6 +169,10 @@ def _make_causal_mask( diagonal = past_key_values_length - sliding_window - 1 context_mask = torch.tril(torch.ones_like(mask, dtype=torch.bool), diagonal=diagonal) + # Recent changes in PyTorch prevent mutations on tensors converted with aten::_to_copy + # See https://github.com/pytorch/pytorch/issues/127571 + if is_torchdynamo_compiling(): + mask = mask.clone() mask.masked_fill_(context_mask, torch.finfo(dtype).min) return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) diff --git a/src/transformers/modeling_flax_pytorch_utils.py b/src/transformers/modeling_flax_pytorch_utils.py index 8bbd8587b683f4..8fbba8a1651364 100644 --- a/src/transformers/modeling_flax_pytorch_utils.py +++ b/src/transformers/modeling_flax_pytorch_utils.py @@ -63,8 +63,6 @@ def load_pytorch_checkpoint_in_flax_state_dict( else: try: import torch # noqa: F401 - - from .pytorch_utils import is_torch_greater_or_equal_than_1_13 # noqa: F401 except (ImportError, ModuleNotFoundError): logger.error( "Loading a PyTorch model in Flax, requires both PyTorch and Flax to be installed. Please see" @@ -73,7 +71,7 @@ def load_pytorch_checkpoint_in_flax_state_dict( ) raise - weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {} + weights_only_kwarg = {"weights_only": True} pt_state_dict = torch.load(pt_path, map_location="cpu", **weights_only_kwarg) logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters.") @@ -246,13 +244,11 @@ def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model): def convert_pytorch_sharded_state_dict_to_flax(shard_filenames, flax_model): import torch - from .pytorch_utils import is_torch_greater_or_equal_than_1_13 - # Load the index flax_state_dict = {} for shard_file in shard_filenames: # load using msgpack utils - weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {} + weights_only_kwarg = {"weights_only": True} pt_state_dict = torch.load(shard_file, **weights_only_kwarg) weight_dtypes = {k: v.dtype for k, v in pt_state_dict.items()} pt_state_dict = { diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py index 7f1367481ade62..8ec24d6e1872ef 100644 --- a/src/transformers/modeling_tf_pytorch_utils.py +++ b/src/transformers/modeling_tf_pytorch_utils.py @@ -180,8 +180,6 @@ def load_pytorch_checkpoint_in_tf2_model( import tensorflow as tf # noqa: F401 import torch # noqa: F401 from safetensors.torch import load_file as safe_load_file # noqa: F401 - - from .pytorch_utils import is_torch_greater_or_equal_than_1_13 # noqa: F401 except ImportError: logger.error( "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. 
Please see " @@ -201,7 +199,7 @@ def load_pytorch_checkpoint_in_tf2_model( if pt_path.endswith(".safetensors"): state_dict = safe_load_file(pt_path) else: - weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {} + weights_only_kwarg = {"weights_only": True} state_dict = torch.load(pt_path, map_location="cpu", **weights_only_kwarg) pt_state_dict.update(state_dict) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 9dcd6d758ecbe7..a6d4a1cc5b54ed 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -54,7 +54,6 @@ apply_chunking_to_forward, find_pruneable_heads_and_indices, id_tensor_storage, - is_torch_greater_or_equal_than_1_13, prune_conv1d_layer, prune_layer, prune_linear_layer, @@ -476,7 +475,7 @@ def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True): error_message += f"\nMissing key(s): {str_unexpected_keys}." raise RuntimeError(error_message) - weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {} + weights_only_kwarg = {"weights_only": True} loader = safe_load_file if load_safe else partial(torch.load, map_location="cpu", **weights_only_kwarg) for shard_file in shard_files: @@ -532,7 +531,7 @@ def load_state_dict( and is_zipfile(checkpoint_file) ): extra_args = {"mmap": True} - weights_only_kwarg = {"weights_only": weights_only} if is_torch_greater_or_equal_than_1_13 else {} + weights_only_kwarg = {"weights_only": weights_only} return torch.load( checkpoint_file, map_location=map_location, @@ -1474,11 +1473,8 @@ def _autoset_attn_implementation( ) if not isinstance(config._attn_implementation, dict) and config._attn_implementation not in [ - "eager", - "sdpa", - "flash_attention_2", - "flex_attention", - ]: + "eager" + ] + list(ALL_ATTENTION_FUNCTIONS.keys()): message = f'Specified `attn_implementation="{config._attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)' if cls._supports_flash_attn_2: message += ', `"attn_implementation=flash_attention_2"` (implementation using flash attention 2)' @@ -1540,6 +1536,8 @@ def _autoset_attn_implementation( "Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends." 
) torch.backends.cuda.enable_flash_sdp(False) + elif requested_attn_implementation in list(ALL_ATTENTION_FUNCTIONS.keys()): + config._attn_implementation = requested_attn_implementation elif isinstance(requested_attn_implementation, dict): config._attn_implementation = None else: diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 03102d22ca0d77..801bd19fca3b60 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -1421,7 +1421,8 @@ def forward( pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) - hidden_states[~padding_mask] = 0.0 + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) logits = self.classifier(pooled_output) diff --git a/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py index ca80636b23565d..53dec63cfc4fd8 100644 --- a/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py @@ -222,7 +222,7 @@ def __init__( "from a PyTorch pretrained vocabulary, " "or activate it with environment variables USE_TORCH=1 and USE_TF=0." ) - vocab_dict = torch.load(pretrained_vocab_file) + vocab_dict = torch.load(pretrained_vocab_file, weights_only=True) if vocab_dict is not None: for key, value in vocab_dict.items(): @@ -705,7 +705,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, # Instantiate tokenizer. corpus = cls(*inputs, **kwargs) - corpus_dict = torch.load(resolved_corpus_file) + corpus_dict = torch.load(resolved_corpus_file, weights_only=True) for key, value in corpus_dict.items(): corpus.__dict__[key] = value corpus.vocab = vocab @@ -784,7 +784,7 @@ def get_lm_corpus(datadir, dataset): fn_pickle = os.path.join(datadir, "cache.pkl") if os.path.exists(fn): logger.info("Loading cached dataset...") - corpus = torch.load(fn_pickle) + corpus = torch.load(fn_pickle, weights_only=True) elif os.path.exists(fn): logger.info("Loading cached dataset from pickle...") if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")): diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 8d5a224f4f6654..e0e4ff424cb47d 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -38,7 +38,6 @@ ) from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import is_torch_greater_or_equal_than_2_0 from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -815,14 +814,6 @@ def _init_weights(self, module: nn.Module): # Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa @classmethod def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> "PretrainedConfig": - # NOTE: Falcon supported SDPA from PyTorch 2.0. We keep it like that for backward compatibility (automatically use SDPA for torch>=2.0). 
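The `_autoset_attn_implementation` hunk earlier in this diff stops hard-coding the accepted `attn_implementation` values and instead accepts `"eager"` plus whatever keys are present in `ALL_ATTENTION_FUNCTIONS`, with the new `elif` branch forwarding any such key straight into `config._attn_implementation`. A minimal sketch of what that registry-driven dispatch enables, assuming the registry behaves as a plain dict (which its use via `list(ALL_ATTENTION_FUNCTIONS.keys())` suggests); the `eager_like_attention` function and its exact signature are illustrative assumptions, not part of this PR:

import torch
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

def eager_like_attention(module, query, key, value, attention_mask, scaling=None, dropout=0.0, **kwargs):
    # query/key/value assumed as [batch, heads, seq, head_dim]; deliberately simple reference math
    scores = torch.matmul(query, key.transpose(-1, -2)) * (scaling or query.shape[-1] ** -0.5)
    if attention_mask is not None:
        scores = scores + attention_mask
    probs = torch.nn.functional.dropout(scores.softmax(dim=-1), p=dropout)
    return torch.matmul(probs, value), probs

# Any key added here would now pass the attn_implementation check above.
ALL_ATTENTION_FUNCTIONS["eager_like"] = eager_like_attention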
- if hard_check_only: - if not is_torch_greater_or_equal_than_2_0: - raise ImportError("PyTorch SDPA requirements in Transformers are not met. Please install torch>=2.0.") - - if not is_torch_greater_or_equal_than_2_0: - return config - _is_bettertransformer = getattr(cls, "use_bettertransformer", False) if _is_bettertransformer: return config diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 6763695bfba036..ef23b5d208fd79 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -36,7 +36,6 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import is_torch_greater_or_equal_than_1_13 from ...utils import ( add_code_sample_docstrings, add_start_docstrings, @@ -56,9 +55,6 @@ # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. # It means that the function will not be traced through and simply appear as a node in the graph. if is_torch_fx_available(): - if not is_torch_greater_or_equal_than_1_13: - import torch.fx - _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 1629f7d4f3feae..f2700836789ebd 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1629,7 +1629,8 @@ def forward( pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) - hidden_states[~padding_mask] = 0.0 + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) logits = self.classifier(pooled_output) diff --git a/src/transformers/models/mamba2/modeling_mamba2.py b/src/transformers/models/mamba2/modeling_mamba2.py index c312b9b94351d2..550eeb7f9665e4 100644 --- a/src/transformers/models/mamba2/modeling_mamba2.py +++ b/src/transformers/models/mamba2/modeling_mamba2.py @@ -44,14 +44,22 @@ from mamba_ssm.ops.triton.selective_state_update import selective_state_update from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined else: - selective_state_update = None + mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined, selective_state_update = None, None, None if is_causal_conv1d_available(): from causal_conv1d import causal_conv1d_fn, causal_conv1d_update else: causal_conv1d_update, causal_conv1d_fn = None, None -is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update)) +is_fast_path_available = all( + ( + selective_state_update, + mamba_chunk_scan_combined, + mamba_split_conv1d_scan_combined, + causal_conv1d_fn, + causal_conv1d_update, + ) +) _CHECKPOINT_FOR_DOC = "mistralai/mamba-codestral-7B-v0.1" _CONFIG_FOR_DOC = "Mamba2Config" @@ -111,6 +119,17 @@ def segment_sum(input_tensor): return tensor_segsum +def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, 
None]).to(dtype) + + return hidden_states + + class Mamba2Cache: """ Arguments: @@ -120,51 +139,69 @@ class Mamba2Cache: device: torch.device Attributes: - seqlen_offset: int - dtype: torch.dtype - conv_states: Dict[int, torch.Tensor] # layer_idx -> [batch_size, intermediate_size, conv_kernel_size] - ssm_states: Dict[int, torch.Tensor] # layer_idx -> [batch_size, intermediate_size, ssm_state_size] + dtype: (`torch.dtype`): + The default `dtype` used to initializing the cache. + conv_kernel_size: (`int`): + Model's convolution kernel size taken from config. + n_groups: (`int`): + Model's number of groups taken from the config - similar to tensor parallel in Transformer. + state_size: (`int`): + Model's SSM state size taken from config. + num_heads: (`int`): + The number of heads used in the linear attention / SSM. + head_dim: (`int`): + The respective dimension of the heads used in the linear attention / SSM. + intermediate_size: (`int`): + Model's intermediate_size based on (expand * hidden_dim) from config. + conv_states: (`torch.Tensor`): + A tensor of shape `[num_layers, batch_size, conv_kernel_size, intermediate_size + 2 * n_groups * state_size]` that holds convolutional states. + ssm_states: (`torch.Tensor`): + A tensor of shape `[num_layers, batch_size, num_heads, head_dim, state_size]` that holds ssm states. """ def __init__( self, config: Mamba2Config, batch_size: int, dtype: torch.dtype = torch.float16, device: Optional[str] = None ): - self.seqlen_offset = 0 self.dtype = dtype self.conv_kernel_size = config.conv_kernel + self.n_groups = config.n_groups + self.state_size = config.state_size + self.num_heads = config.num_heads + self.head_dim = config.head_dim self.intermediate_size = int(config.expand * config.hidden_size) - self.conv_states = { - i: torch.zeros( - batch_size, - self.intermediate_size + 2 * config.n_groups * config.state_size, - self.conv_kernel_size, - device=device, - dtype=dtype, - ) - for i in range(config.num_hidden_layers) - } - self.ssm_states = { - i: torch.zeros( - batch_size, config.num_heads, config.head_dim, config.state_size, device=device, dtype=dtype - ) - for i in range(config.num_hidden_layers) - } - self.activation = config.hidden_act - self.act = ACT2FN[config.hidden_act] + self.conv_states = torch.zeros( + config.num_hidden_layers, + batch_size, + self.intermediate_size + 2 * self.n_groups * self.state_size, + self.conv_kernel_size, + device=device, + dtype=dtype, + ) + self.ssm_states = torch.zeros( + config.num_hidden_layers, + batch_size, + self.num_heads, + self.head_dim, + self.state_size, + device=device, + dtype=dtype, + ) def update_conv_state( - self, layer_idx: int, new_conv_state: torch.Tensor, cache_position: torch.LongTensor + self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False ) -> torch.Tensor: - conv_state = self.conv_states[layer_idx] - cache_position = cache_position.clamp(0, self.conv_kernel_size - 1) - - conv_state = conv_state.roll(shifts=-1, dims=-1) - conv_state[:, :, cache_position] = new_conv_state.to(conv_state.device) - self.conv_states[layer_idx].zero_() - self.conv_states[layer_idx] += conv_state + if cache_init: + self.conv_states[layer_idx] = new_conv_state.to(self.conv_states.device) + else: + self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1) + self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states.device) return self.conv_states[layer_idx] + def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor): + 
self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states.device) + return self.ssm_states[layer_idx] + def reset(self): self.conv_states.zero_() self.ssm_states.zero_() @@ -269,19 +306,27 @@ def cuda_kernels_forward( cache_position: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, ): - # set up dimensions for reshapes later + # 1. Gated MLP's linear projection + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + projected_states = self.in_proj(hidden_states) + # Set up dimensions for reshapes later batch_size, seq_len, _ = hidden_states.shape groups_time_state_size = self.n_groups * self.ssm_state_size - d_to_remove = 2 * self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.num_heads - - # getting projected states from cache if it exists - if cache_params is not None and cache_params.seqlen_offset > 0: - in_projected_states = self.in_proj(hidden_states.squeeze(1)) # (B 2D) - d_mlp = (in_projected_states.shape[-1] - d_to_remove) // 2 - split_projection_dim = [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads] - _, _, gate, hidden_states_B_C, dt = torch.split(in_projected_states, split_projection_dim, dim=-1) + d_mlp = ( + projected_states.shape[-1] + - 2 * self.intermediate_size + - 2 * self.n_groups * self.ssm_state_size + - self.num_heads + ) // 2 + + # Single step calculations via cache + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + _, _, gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( + [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + # 2. Convolution sequence transformation hidden_states_B_C = causal_conv1d_update( hidden_states_B_C, cache_params.conv_states[self.layer_idx], @@ -295,8 +340,9 @@ def cuda_kernels_forward( [self.intermediate_size, groups_time_state_size, groups_time_state_size], dim=-1, ) - A = -torch.exp(self.A_log.float()) # (nheads,) + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # (nheads,) A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) dt = dt[:, :, None].expand(-1, -1, self.head_dim) dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) @@ -318,20 +364,18 @@ def cuda_kernels_forward( ) hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim) hidden_states = self.norm(hidden_states, gate) + + # 4. Final linear projection out = self.out_proj(hidden_states)[:, None, ...] - # if no cache is found, calling the kernel + + # Fused calculations or step by step if no initialized cache is found else: - if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: - # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 - dtype = hidden_states.dtype - hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) - # 1. Gated MLP's linear projection - projected_states = self.in_proj(hidden_states) A = -torch.exp(self.A_log.float()) # (num_heads) or (intermediate_size, state_size) dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} + # 2-4. 
Fused kernel for conv1d, SSM, and the final projection if self.training and cache_params is None: - out, ssm_state = mamba_split_conv1d_scan_combined( + out = mamba_split_conv1d_scan_combined( projected_states, self.conv1d.weight.squeeze(1), self.conv1d.bias, @@ -348,41 +392,50 @@ def cuda_kernels_forward( headdim=self.head_dim, ngroups=self.n_groups, norm_before_gate=False, - return_final_states=True, + return_final_states=False, **dt_limit_kwargs, ) else: - gate, hidden_states_B_C, time_step = torch.split( - projected_states, - [self.intermediate_size, self.conv_dim, self.num_heads], - dim=-1, + _, _, gate, hidden_states_B_C, dt = projected_states.split( + [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 ) - # 1D Convolution - if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]: + # 2. Convolution sequence transformation + # Init cache + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, + (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), + ) + cache_params.update_conv_state( + layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True + ) + + if self.activation not in ["silu", "swish"]: hidden_states_B_C = self.act( - self.conv1d(hidden_states_B_C.transpose(1, 2)).transpose(1, 2)[:, :seq_len] - ) # (B, L, self.d_inner + 2 * ngroups * d_state) + self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2) + ) else: hidden_states_B_C = causal_conv1d_fn( x=hidden_states_B_C.transpose(1, 2), weight=self.conv1d.weight.squeeze(1), bias=self.conv1d.bias, activation=self.activation, - ).transpose(1, 2)[:, :seq_len] + ).transpose(1, 2) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) hidden_states, B, C = torch.split( hidden_states_B_C, [self.intermediate_size, groups_time_state_size, groups_time_state_size], dim=-1, ) - if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: - # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 - dtype = hidden_states.dtype - hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + # 3. SSM transformation scan_output, ssm_state = mamba_chunk_scan_combined( hidden_states.view(batch_size, seq_len, -1, self.head_dim), - time_step, + dt, A, B.view(batch_size, seq_len, self.n_groups, -1), C.view(batch_size, seq_len, self.n_groups, -1), @@ -395,11 +448,16 @@ def cuda_kernels_forward( dt_softplus=True, **dt_limit_kwargs, ) + + # Init cache if ssm_state is not None and cache_params is not None: - cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) + scan_output = scan_output.view(batch_size, seq_len, -1) # Multiply "gate" branch and apply extra normalization layer scan_output = self.norm(scan_output, gate) + + # 4. 
Final linear projection out = self.out_proj(scan_output) return out @@ -407,60 +465,64 @@ def cuda_kernels_forward( def torch_forward(self, input_states, cache_params: Optional[Mamba2Cache]=None, cache_position:Optional[torch.LongTensor]=None, attention_mask: Optional[torch.Tensor]=None): batch_size, seq_len, _ = input_states.shape dtype = input_states.dtype - # Gated MLP's linear projection - projected_states = self.in_proj(input_states.squeeze(1)) - d_mlp = (projected_states.shape[-1] - 2 * self.intermediate_size - 2 * self.n_groups * self.ssm_state_size- self.num_heads) // 2 - _, _, gate, hidden_states, dt = projected_states.split( + + # 1. Gated MLP's linear projection + input_states = apply_mask_to_padding_states(input_states, attention_mask) + projected_states = self.in_proj(input_states) + d_mlp = (projected_states.shape[-1] - 2 * self.intermediate_size - 2 * self.n_groups * self.ssm_state_size-self.num_heads) // 2 + _, _, gate, hidden_states_B_C, dt = projected_states.split( [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 ) - # Convolution sequence transformation - if cache_params is not None: - ssm_state = cache_params.ssm_states[self.layer_idx].clone() - ssm_state = ssm_state.to(hidden_states.device) - if cache_params.seqlen_offset > 0: - conv_state = cache_params.conv_states[self.layer_idx] # [batch, intermediate_size, conv_kernel_size] - conv_state = torch.roll(conv_state, shifts=-1, dims=-1) - # handle batched generation - states are copied through - conv_state[:, :, -1] = hidden_states[:, 0, :] if hidden_states.ndim == 3 else hidden_states - cache_params.conv_states[self.layer_idx].copy_(conv_state) - hidden_states = torch.sum(conv_state.to(projected_states.device) * self.conv1d.weight[:, 0, :], dim=-1) - if self.use_conv_bias: - hidden_states += self.conv1d.bias - hidden_states = self.act(hidden_states).to(dtype)[:, None, ...] # [batch, 1, intermediate_size] : decoding - else: - hidden_states = hidden_states.transpose(1,2) - conv_state = nn.functional.pad( - hidden_states, - (self.conv_kernel_size - hidden_states.shape[-1], 0) - ) - cache_params.conv_states[self.layer_idx].copy_(conv_state) - hidden_states = self.act(self.conv1d(hidden_states).transpose(1,2))[:, :seq_len, :] # [batch, intermediate_size, seq_len] - if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: - dtype = hidden_states.dtype - # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 - hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) - else: - ssm_state = torch.zeros( - (batch_size, self.num_heads, self.head_dim, self.ssm_state_size), - device=hidden_states.device, dtype=dtype + # 2. 
Convolution sequence transformation + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=hidden_states_B_C, cache_init=False) + + # We need to guarantee that anything regarding the cache is on the same device + conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) + + hidden_states_B_C = torch.sum( + conv_states * self.conv1d.weight.squeeze(1), dim=-1 ) - hidden_states = self.act(self.conv1d(hidden_states.transpose(1, 2))[..., :seq_len].transpose(1, 2)) - hidden_states, B, C = torch.split(hidden_states, [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], dim=-1) + if self.use_conv_bias: + hidden_states_B_C = hidden_states_B_C + self.conv1d.bias + hidden_states_B_C = self.act(hidden_states_B_C) + else: + # Init cache + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, (cache_params.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + ) + cache_params.update_conv_state(layer_idx=self.layer_idx, new_conv_state=conv_states, cache_init=True) + + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], + dim=-1 + ) + + # 3. SSM transformation A = -torch.exp(self.A_log.float()) # [num_heads] - if cache_params is not None and cache_params.seqlen_offset > 0: + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + # We need to guarantee that anything regarding the cache is on the same device + cache_device = cache_params.ssm_states.device + # Note: there is no need to pad parameter matrices here, as there is just one new token # for batched generation - dt = dt[:, None, ...] if dt.ndim == 2 else dt[:, 0, :][:, None, ...] + dt = dt[:, 0, :][:, None, ...] 
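Before the decode-path tensors above are combined, it may help to see the discretised state update in isolation. A toy shape sketch with invented sizes (not tied to any real config); the real code additionally moves tensors to the cache device and folds in `C`, `D`, the gate and the final norm:

import torch

batch, num_heads, head_dim, state_size = 2, 4, 8, 16
dt = torch.rand(batch, num_heads, head_dim)               # after softplus + clamp to time_step_limit
A = -torch.rand(num_heads, head_dim, state_size)          # per-head decay, broadcast from [num_heads]
B = torch.rand(batch, num_heads, 1, state_size)           # a single new token, so no sequence axis
x = torch.rand(batch, num_heads, head_dim)                # hidden_states for that token

dA = torch.exp(dt[..., None] * A)                         # [batch, num_heads, head_dim, state_size]
dBx = dt[..., None] * B * x[..., None]                    # same shape after broadcasting
ssm_state = torch.zeros(batch, num_heads, head_dim, state_size)
ssm_state = ssm_state * dA + dBx                          # the recurrence stored via update_ssm_state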
dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim) # [num_heads] -> [num_heads, head_dim] dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype)) - dt = torch.clamp(dt, self.time_step_min) #, self.time_step_max) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) # [bsz, num_heads, head_dim, state_size] - dA = torch.exp(dt[..., None] * A) + dA = (torch.exp(dt[..., None] * A)).to(device=cache_device) # Discretize B # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] -> @@ -474,11 +536,12 @@ def torch_forward(self, input_states, cache_params: Optional[Mamba2Cache]=None, # Discretize x into dB # [bsz, intermediate_size] -> [bsz, num_heads, head_dim] hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) - dBx = dB * hidden_states[..., None] + dBx = (dB * hidden_states[..., None]).to(device=cache_device) # State calculation - cache_params.ssm_states[self.layer_idx].copy_( - cache_params.ssm_states[self.layer_idx] * dA + dBx + cache_params.update_ssm_state( + layer_idx=self.layer_idx, + new_ssm_state=cache_params.ssm_states[self.layer_idx] * dA + dBx ) # Subsequent output @@ -488,7 +551,7 @@ def torch_forward(self, input_states, cache_params: Optional[Mamba2Cache]=None, C = C.reshape(batch_size, -1, C.shape[-1]) # [bsz, num_heads, head_dim] - ssm_states = cache_params.ssm_states[self.layer_idx].to(C.dtype) # Shape: [b, h, d, n] + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) # Shape: [b, h, d, n] # Reshape ssm_states to merge the first two dimensions ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) # Shape: [b*h, d, n] C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) # Shape: [b*h, n, 1] @@ -505,9 +568,9 @@ def torch_forward(self, input_states, cache_params: Optional[Mamba2Cache]=None, else: # begin ssd naive implementation without einsums dt = nn.functional.softplus(dt + self.dt_bias) - dt = torch.clamp(dt, self.time_step_min) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() - B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() B = B.repeat(1, 1, self.num_heads // self.n_groups, 1) C = C.repeat(1, 1, self.num_heads // self.n_groups, 1) @@ -522,7 +585,6 @@ def torch_forward(self, input_states, cache_params: Optional[Mamba2Cache]=None, # Rearrange into blocks/chunks hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)] - # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size] A = A.permute(0, 3, 1, 2) A_cumsum = torch.cumsum(A, dim=-1) @@ -531,45 +593,43 @@ def torch_forward(self, input_states, cache_params: Optional[Mamba2Cache]=None, # This is the analog of a causal mask L = torch.exp(segment_sum(A)) - # First, contraction of C and B to get G (attention-weights like) - G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, : ,:] # shape: (b, c, l, s, h, n) + # Contraction of C and B to get G (attention-weights like) + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] # 
shape: (b, c, l, s, h, n) G = G_intermediate.sum(dim=-1) # shape: (b, c, l, s, h) - - # Step 2: Compute M, equivalent to applying attention mask to weights + # Compute M, equivalent to applying attention mask to weights M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] M = M_intermediate.sum(dim=-1) - # Step 3: Compute Y_diag (apply to values) - Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(3) + # Compute Y_diag (apply to values) + Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) + # 2. Compute the state for each intra-chunk # (right term of low-rank factorization of off-diagonal blocks; B terms) - decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum)) - B_decay_contraction = B * decay_states.permute(0, 2, 3, 1)[..., None] - # permute back B * decay states - states = (B_decay_contraction.permute(0, 1, 3, 2, 4)[..., None] * hidden_states.permute(0, 1, 3, 2, 4)[..., None, :]).sum(dim=3).permute(0, 1, 2, 4, 3) - if cache_params is not None and cache_params.seqlen_offset > 0: - previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...] + B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] + states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + if cache_params is not None and cache_position is not None and cache_position[0] > 0: + previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) else: previous_states = torch.zeros_like(states[:, :1]) states = torch.cat([previous_states, states], dim=1) decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0)))) - - states_permuted = states.permute(0, 2, 1, 3, 4) - result = (decay_chunk[..., None, None] * states_permuted[:, :, None, ...]).sum(dim=2) - new_states = result.permute(0, 2, 1, 3, 4) + decay_chunk = decay_chunk.transpose(1, 3) + new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1) states, ssm_state = new_states[:, :-1], new_states[:, -1] - # Compute state -> output conversion per chunk + # 4. 
Compute state -> output conversion per chunk # (left term of low-rank factorization of off-diagonal blocks; C terms) state_decay_out = torch.exp(A_cumsum) - # compute Yoff C_times_states = (C[..., None, :] * states[:, :, None, ...]) state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) - # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks) + # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks) y = Y_diag + Y_off # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim] y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) @@ -579,8 +639,10 @@ def torch_forward(self, input_states, cache_params: Optional[Mamba2Cache]=None, if pad_size > 0: y = y[:, :seq_len, :, :] y = y.reshape(batch_size, seq_len, -1) + + # Init cache if ssm_state is not None and cache_params is not None: - cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + cache_params.update_ssm_state(layer_idx=self.layer_idx, new_ssm_state=ssm_state) scan_output = self.norm(y, gate) @@ -916,9 +978,6 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if use_cache: - cache_params.seqlen_offset += inputs_embeds.shape[1] - hidden_states = self.norm_f(hidden_states) if output_hidden_states: @@ -975,10 +1034,6 @@ def prepare_inputs_for_generation( ): # Overwitten -- uses `cache_params` as opposed to `past_key_values` - if inputs_embeds is not None: - past_len = inputs_embeds.shape[1] + input_ids.shape[1] - else: - past_len = input_ids.shape[1] if use_cache: # `cache_position` should have been initialized in `generate` if cache_position is None: @@ -987,33 +1042,18 @@ def prepare_inputs_for_generation( "`model.generate`, you are responsible for passing in a valid `cache_position` if " "you are calling `prepare_inputs_for_generation` directly with `use_cache=True`" ) - # how do we detect that we are in decoding without cache? 
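The rewritten `prepare_inputs_for_generation` below no longer needs the removed `seqlen_offset` bookkeeping: `cache_position` alone distinguishes prefill from single-token decoding. A condensed, illustrative sketch of that branch (the helper name and return convention are invented; the surrounding `cache_params` plumbing is omitted):

import torch

def select_mamba2_generation_inputs(input_ids, attention_mask, cache_position, conv_kernel_size):
    if cache_position is not None and cache_position[0] > 0:
        # Decoding: the cache already holds the prompt, so only the newest token is fed
        # and the attention mask is dropped for the single-token step.
        return input_ids[:, -1][..., None], None, cache_position
    # Prefill: cache_position spans the conv kernel, matching the padded/truncated
    # length of cache_params.conv_states (config.conv_kernel).
    cache_position = torch.arange(0, conv_kernel_size, device=input_ids.device)
    return input_ids, attention_mask, cache_position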
if cache_position[0] > 0: input_ids = input_ids[:, -1][..., None] - attention_mask = attention_mask[:, -1][..., None] + + if attention_mask is not None: + attention_mask = None else: # we initialize the `cache_position` to full size of `conv_states` at prefill stage # considering padding will be applied when input length is shorter, and truncation # will be applied when it is longer, so it will be equivalent to always have it match # the length of `cache_params.conv_states`, which is `config.conv_kernel` - cache_position = torch.arange(0, past_len, device=input_ids.device) - # if the cache is not used, we also do have to extend the attention mask here - # TODO there is likely a cleverer way to do this - extended_mask = torch.ones( - attention_mask.size(0), past_len - attention_mask.shape[1], device=attention_mask.device - ) - attention_mask = torch.cat([attention_mask, extended_mask], dim=1) - cache_params = None - - if attention_mask.shape[1] < past_len: - # we have to update manually the attention mask if - # we are in decoding without cache - # and we don't have position_ids here - # TODO but we should be able to use cache_position though at a later time - extended_mask = torch.ones( - attention_mask.size(0), past_len - attention_mask.shape[1], device=attention_mask.device - ) - attention_mask = torch.cat([attention_mask, extended_mask], dim=1) + cache_position = torch.arange(0, self.config.conv_kernel, device=input_ids.device) + if inputs_embeds is not None and cache_params is None: model_inputs = {"inputs_embeds": inputs_embeds} else: diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index cd54b226e1d85c..8f6b092da6e6ad 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -33,7 +33,6 @@ ) from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import is_torch_greater_or_equal_than_1_13 from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, @@ -51,9 +50,6 @@ # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. # It means that the function will not be traced through and simply appear as a node in the graph. 
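The `torch.fx.wrap` call kept below registers `_prepare_4d_causal_attention_mask` as a leaf of the FX graph; only the explicit `import torch.fx` fallback for torch < 1.13 is removed, since that is below the new minimum version. A minimal, self-contained illustration of what `wrap` does (the helper and module here are invented for the example):

import torch
import torch.fx

def make_causal_mask(seq_len):
    # Data-dependent helper we do not want symbolic_trace to step into.
    return torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))

make_causal_mask = torch.fx.wrap(make_causal_mask)  # must be called at module level; also usable as a decorator

class TinyModel(torch.nn.Module):
    def forward(self, scores):  # scores: [..., seq, seq] attention logits
        mask = make_causal_mask(scores.shape[-1])
        return scores.masked_fill(~mask, float("-inf"))

gm = torch.fx.symbolic_trace(TinyModel())
print(gm.graph)                  # make_causal_mask appears as a single call_function node
print(gm(torch.zeros(1, 3, 3)))  # the traced module still executes normally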
if is_torch_fx_available(): - if not is_torch_greater_or_equal_than_1_13: - import torch.fx - _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 55042327de4ec3..ef98ae5e3f508f 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -163,6 +163,16 @@ class Qwen2VLConfig(PretrainedConfig): model_type = "qwen2_vl" sub_configs = {"vision_config": Qwen2VLVisionConfig} keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `Qwen2VL` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } def __init__( self, diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 10c9b1638548ce..566141d3f75c27 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -547,9 +547,9 @@ def forward( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) cos, sin = position_embeddings query_states, key_states = apply_multimodal_rotary_pos_emb( @@ -631,9 +631,9 @@ def forward( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) # Because the input can be padded, the absolute sequence length depends on the max position id. 
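The switch to `view(bsz, q_len, -1, self.head_dim)` above lets each attention variant infer the local number of heads from the projection output rather than from `self.num_heads` / `self.num_key_value_heads`. That is consistent with the `base_model_tp_plan` added to `Qwen2VLConfig` in this diff (column-parallel q/k/v, row-parallel o_proj): a sharded projection simply yields fewer heads per rank. The connection is inferred here, not stated in the diff; a small shape illustration with invented sizes:

import torch

bsz, q_len, head_dim, num_heads, tp_size = 2, 5, 64, 16, 4
hidden = torch.randn(bsz, q_len, num_heads * head_dim)

# Unsharded: all 16 heads live on one device.
full = hidden.view(bsz, q_len, -1, head_dim).transpose(1, 2)
assert full.shape == (bsz, num_heads, q_len, head_dim)

# A column-parallel projection on one of 4 ranks produces only 16 / 4 = 4 heads locally,
# and the same view(...) call adapts without consulting the config.
local_out = hidden[..., : (num_heads // tp_size) * head_dim]
local = local_out.view(bsz, q_len, -1, head_dim).transpose(1, 2)
assert local.shape == (bsz, num_heads // tp_size, q_len, head_dim)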
cos, sin = position_embeddings @@ -750,9 +750,9 @@ def forward( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) cos, sin = position_embeddings query_states, key_states = apply_multimodal_rotary_pos_emb( diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 1959d21e1d5d94..8dc3e2297d4525 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -882,15 +882,15 @@ def forward( all_self_attentions = () if output_attentions else None if attention_mask is not None: + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) if self._use_flash_attention_2: # make sure padded tokens output 0 - hidden_states[~attention_mask] = 0.0 + hidden_states[~expand_attention_mask] = 0.0 # 2d mask is passed through the layers attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None else: # make sure padded tokens output 0 - hidden_states[~attention_mask] = 0.0 - + hidden_states[~expand_attention_mask] = 0.0 input_lengths = (attention_mask.long()).sum(-1) # apply pooling formula to get real output_lengths output_lengths = input_lengths // self.config.squeeze_factor @@ -1473,7 +1473,8 @@ def forward( pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) - hidden_states[~padding_mask] = 0.0 + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) logits = self.classifier(pooled_output) diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 5cccc0218e6ccf..2df687f4cc362a 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1175,7 +1175,8 @@ def forward( ) else: # make sure padded tokens output 0 - hidden_states[~attention_mask.bool()] = 0.0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask.bool()] = 0.0 input_lengths = (attention_mask.long()).sum(-1) # apply pooling formula to get real output_lengths @@ -1721,7 +1722,8 @@ def forward( pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) - hidden_states[~padding_mask] = 0.0 + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) logits = self.classifier(pooled_output) diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py index 1075de299a9f40..dcdd85460b39bd 100644 
--- a/src/transformers/models/superpoint/modeling_superpoint.py +++ b/src/transformers/models/superpoint/modeling_superpoint.py @@ -25,7 +25,6 @@ ) from transformers.models.superpoint.configuration_superpoint import SuperPointConfig -from ...pytorch_utils import is_torch_greater_or_equal_than_1_13 from ...utils import ( ModelOutput, add_start_docstrings, @@ -314,7 +313,7 @@ def _sample_descriptors(keypoints, descriptors, scale: int = 8) -> torch.Tensor: divisor = divisor.to(keypoints) keypoints /= divisor keypoints = keypoints * 2 - 1 # normalize to (-1, 1) - kwargs = {"align_corners": True} if is_torch_greater_or_equal_than_1_13 else {} + kwargs = {"align_corners": True} # [batch_size, num_channels, num_keypoints, 2] -> [batch_size, num_channels, num_keypoints, 2] keypoints = keypoints.view(batch_size, 1, -1, 2) descriptors = nn.functional.grid_sample(descriptors, keypoints, mode="bilinear", **kwargs) diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index b74a27ae5ce589..2ea0d38a23f933 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -31,7 +31,6 @@ from ...pytorch_utils import ( apply_chunking_to_forward, find_pruneable_heads_and_indices, - is_torch_greater_or_equal_than_1_12, prune_linear_layer, ) from ...utils import ( @@ -46,12 +45,6 @@ logger = logging.get_logger(__name__) -if not is_torch_greater_or_equal_than_1_12: - logger.warning( - f"You are using torch=={torch.__version__}, but torch>=1.12.0 is required to use " - "TapasModel. Please upgrade torch." - ) - _CONFIG_FOR_DOC = "TapasConfig" _CHECKPOINT_FOR_DOC = "google/tapas-base" diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index d1496432279527..f355eb03bdb82f 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1876,7 +1876,8 @@ def forward( pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) - hidden_states[~padding_mask] = 0.0 + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) logits = self.classifier(pooled_output) diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 49551b73577ad7..0fd6e7cb2c04e1 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1886,7 +1886,8 @@ def forward( pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) - hidden_states[~padding_mask] = 0.0 + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) logits = self.classifier(pooled_output) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index ca743e1eaef3af..5168904a3579d9 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -38,7 +38,6 @@ 
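The recurring `expand_padding_mask` change (data2vec_audio, hubert, sew, sew_d, unispeech and unispeech_sat above, and the wav2vec2-family models below) expands the 2-D padding mask to the full hidden-state shape before zeroing padded positions, instead of boolean-indexing with the 2-D mask directly. A small self-contained check with invented sizes, showing that the expanded mask zeroes exactly the padded rows used by the mean pooling that follows:

import torch

batch, seq, hidden = 2, 4, 3
hidden_states = torch.randn(batch, seq, hidden)
padding_mask = torch.tensor([[True, True, False, False],
                             [True, True, True, False]])  # True = real token

expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
masked = hidden_states.clone()
masked[~expand_padding_mask] = 0.0

# Equivalent broadcasting form, kept here only as a sanity check.
assert torch.equal(masked, hidden_states * padding_mask.unsqueeze(-1))

pooled = masked.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)  # mean over real tokens only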
XVectorOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import is_torch_greater_or_equal_than_1_13 from ...utils import ( ModelOutput, add_code_sample_docstrings, @@ -1590,7 +1589,7 @@ def load_adapter(self, target_lang: str, force_load=True, **kwargs): cache_dir=cache_dir, ) - weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {} + weights_only_kwarg = {"weights_only": True} state_dict = torch.load( weight_path, map_location="cpu", @@ -2376,7 +2375,8 @@ def forward( pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) - hidden_states[~padding_mask] = 0.0 + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) logits = self.classifier(pooled_output) diff --git a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py index 6f1d5576df7316..7774c7a4069d02 100644 --- a/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +++ b/src/transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py @@ -1359,7 +1359,8 @@ def forward( pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) - hidden_states[~padding_mask] = 0.0 + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) logits = self.classifier(pooled_output) diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index 933bf8f6dc0bcd..494654a6774754 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -878,7 +878,8 @@ def forward( if attention_mask is not None: # make sure padded tokens output 0 - hidden_states[~attention_mask] = 0.0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0.0 # extend attention_mask attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) @@ -1791,7 +1792,8 @@ def forward( pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) - hidden_states[~padding_mask] = 0.0 + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) logits = self.classifier(pooled_output) diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index 4df192fda5efa3..3e5e3790005377 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -691,7 +691,8 @@ def forward( if attention_mask is not None: # make sure padded tokens output 0 - hidden_states[~attention_mask] = 0.0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 position_embeddings = 
self.pos_conv_embed(hidden_states) hidden_states = hidden_states + position_embeddings @@ -776,7 +777,8 @@ def forward( if attention_mask is not None: # make sure padded tokens are not attended to - hidden_states[~attention_mask] = 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 position_embeddings = self.pos_conv_embed(hidden_states) hidden_states = hidden_states + position_embeddings @@ -1508,7 +1510,8 @@ def forward( pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask) - hidden_states[~padding_mask] = 0.0 + expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1) logits = self.classifier(pooled_output) diff --git a/src/transformers/models/zoedepth/modeling_zoedepth.py b/src/transformers/models/zoedepth/modeling_zoedepth.py index 5cbbdcdc04b756..2f4a42d2818005 100644 --- a/src/transformers/models/zoedepth/modeling_zoedepth.py +++ b/src/transformers/models/zoedepth/modeling_zoedepth.py @@ -417,7 +417,7 @@ def __init__(self, n_classes=256, act=torch.softmax): self.k = n_classes self.act = act self.register_buffer("k_idx", torch.arange(0, n_classes).view(1, -1, 1, 1), persistent=False) - self.register_buffer("k_minus_1", torch.Tensor([self.k - 1]).view(1, -1, 1, 1), persistent=False) + self.register_buffer("k_minus_1", torch.tensor([self.k - 1]).view(1, -1, 1, 1), persistent=False) def forward(self, probabilities, temperature=1.0, eps=1e-4): """Compute the log binomial distribution for probabilities. diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py index fab1b9118d18d3..95c8748375ce0a 100644 --- a/src/transformers/pytorch_utils.py +++ b/src/transformers/pytorch_utils.py @@ -34,9 +34,6 @@ is_torch_greater_or_equal_than_2_3 = parsed_torch_version_base >= version.parse("2.3") is_torch_greater_or_equal_than_2_2 = parsed_torch_version_base >= version.parse("2.2") is_torch_greater_or_equal_than_2_1 = parsed_torch_version_base >= version.parse("2.1") -is_torch_greater_or_equal_than_2_0 = parsed_torch_version_base >= version.parse("2.0") -is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse("1.13") -is_torch_greater_or_equal_than_1_12 = parsed_torch_version_base >= version.parse("1.12") # Cache this result has it's a C FFI call which can be pretty time-consuming _torch_distributed_available = torch.distributed.is_available() diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py index 818072a0d91647..47b54cd27bcebe 100755 --- a/src/transformers/quantizers/auto.py +++ b/src/transformers/quantizers/auto.py @@ -29,6 +29,7 @@ QuantizationMethod, QuantoConfig, TorchAoConfig, + VptqConfig, ) from .quantizer_aqlm import AqlmHfQuantizer from .quantizer_awq import AwqQuantizer @@ -42,6 +43,7 @@ from .quantizer_hqq import HqqHfQuantizer from .quantizer_quanto import QuantoHfQuantizer from .quantizer_torchao import TorchAoHfQuantizer +from .quantizer_vptq import VptqHfQuantizer AUTO_QUANTIZER_MAPPING = { @@ -57,6 +59,7 @@ "fbgemm_fp8": FbgemmFp8HfQuantizer, "torchao": TorchAoHfQuantizer, "bitnet": BitNetHfQuantizer, + "vptq": VptqHfQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { @@ -72,6 +75,7 @@ "fbgemm_fp8": FbgemmFp8Config, "torchao": TorchAoConfig, "bitnet": BitNetConfig, + "vptq": 
VptqConfig, } diff --git a/src/transformers/quantizers/quantizer_vptq.py b/src/transformers/quantizers/quantizer_vptq.py new file mode 100644 index 00000000000000..1672c3ebc5a7d3 --- /dev/null +++ b/src/transformers/quantizers/quantizer_vptq.py @@ -0,0 +1,98 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING, Optional + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_torch_available, is_vptq_available, logging +from ..utils.quantization_config import QuantizationConfigMixin + + +if is_torch_available(): + import torch + +logger = logging.get_logger(__name__) + + +class VptqHfQuantizer(HfQuantizer): + """ + Quantizer of the VPTQ method. Enables the loading of prequantized models. + """ + + requires_calibration = True + required_packages = ["vptq"] + + def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_accelerate_available(): + raise ImportError("Using `vptq` quantization requires Accelerate: `pip install accelerate`") + + if not is_vptq_available(): + raise ImportError("Using `vptq` quantization requires VPTQ>=0.0.4: `pip install -U vptq`") + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + if torch.cuda.is_available(): + torch_dtype = torch.float16 + logger.info( + "CUDA available. Assuming VPTQ inference on GPU and loading the model in `torch.float16`. To overwrite it, set `torch_dtype` manually." + ) + else: + import vptq + + device_availability = getattr(vptq, "device_availability", lambda device: False) + if device_availability("cpu") is True: + raise RuntimeError("No GPU found. Please wait for the next release of VPTQ to use CPU inference") + torch_dtype = torch.float32 + logger.info("No GPU found. 
Assuming VPTQ inference on CPU and loading the model in `torch.float32`.")
+        return torch_dtype
+
+    def _process_model_before_weight_loading(
+        self,
+        model: "PreTrainedModel",
+        **kwargs,
+    ):
+        """
+        We don't have a `modules_to_not_convert` parameter to indicate which layers should not be quantized,
+        because `quantization_config` already includes the layers that should be quantized.
+        """
+        from ..integrations import replace_with_vptq_linear
+
+        modules_to_not_convert = kwargs.get("modules_to_not_convert", []) + (
+            self.quantization_config.modules_to_not_convert or []
+        )
+
+        replace_with_vptq_linear(
+            model,
+            quantization_config=self.quantization_config,
+            modules_to_not_convert=modules_to_not_convert,
+        )
+        model.config.quantization_config = self.quantization_config
+
+    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
+        return model
+
+    @property
+    def is_trainable(self, model: Optional["PreTrainedModel"] = None):
+        return False
+
+    def is_serializable(self, safe_serialization=None):
+        return True
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 409f274d41eb17..2f523ed36d983f 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -14,6 +14,7 @@
 
 import collections
 import contextlib
+import copy
 import doctest
 import functools
 import gc
@@ -142,6 +143,7 @@
     is_torchdynamo_available,
     is_torchvision_available,
     is_vision_available,
+    is_vptq_available,
     strtobool,
 )
 
@@ -1142,6 +1144,13 @@ def require_aqlm(test_case):
     return unittest.skipUnless(is_aqlm_available(), "test requires aqlm")(test_case)
 
 
+def require_vptq(test_case):
+    """
+    Decorator marking a test that requires vptq
+    """
+    return unittest.skipUnless(is_vptq_available(), "test requires vptq")(test_case)
+
+
 def require_eetq(test_case):
     """
     Decorator marking a test that requires eetq
@@ -1388,6 +1397,53 @@ def assert_screenout(out, what):
     assert match_str != -1, f"expecting to find {what} in output: f{out_pr}"
 
 
+def set_model_tester_for_less_flaky_test(test_case):
+    if hasattr(test_case.model_tester, "num_hidden_layers"):
+        test_case.model_tester.num_hidden_layers = 1
+    if (
+        hasattr(test_case.model_tester, "vision_config")
+        and "num_hidden_layers" in test_case.model_tester.vision_config
+    ):
+        test_case.model_tester.vision_config = copy.deepcopy(test_case.model_tester.vision_config)
+        test_case.model_tester.vision_config["num_hidden_layers"] = 1
+    if hasattr(test_case.model_tester, "text_config") and "num_hidden_layers" in test_case.model_tester.text_config:
+        test_case.model_tester.text_config = copy.deepcopy(test_case.model_tester.text_config)
+        test_case.model_tester.text_config["num_hidden_layers"] = 1
+
+
+def set_config_for_less_flaky_test(config):
+    target_attrs = [
+        "rms_norm_eps",
+        "layer_norm_eps",
+        "norm_eps",
+        "norm_epsilon",
+        "layer_norm_epsilon",
+        "batch_norm_eps",
+    ]
+    for target_attr in target_attrs:
+        setattr(config, target_attr, 1.0)
+
+    # Norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance.
+    # (We don't need the original epsilon values to check that eager/sdpa outputs match.)
+    attrs = ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]
+    for attr in attrs:
+        if hasattr(config, attr):
+            for target_attr in target_attrs:
+                setattr(getattr(config, attr), target_attr, 1.0)
+
+
+def set_model_for_less_flaky_test(model):
+    # Another way to make sure norm layers have the desired epsilon. (Some models don't set it from their config.)
+ target_names = ("LayerNorm", "GroupNorm", "BatchNorm", "RMSNorm", "BatchNorm2d", "BatchNorm1d") + target_attrs = ["eps", "epsilon", "variance_epsilon"] + if is_torch_available() and isinstance(model, torch.nn.Module): + for module in model.modules(): + if type(module).__name__.endswith(target_names): + for attr in target_attrs: + if hasattr(module, attr): + setattr(module, attr, 1.0) + + class CaptureStd: """ Context manager to capture: diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4d90c13df825f2..c878d2b345cc31 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -75,7 +75,6 @@ from .processing_utils import ProcessorMixin from .pytorch_utils import ( ALL_LAYERNORM_LAYERS, - is_torch_greater_or_equal_than_1_13, is_torch_greater_or_equal_than_2_3, ) from .tokenization_utils_base import PreTrainedTokenizerBase @@ -2778,7 +2777,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): ) if os.path.isfile(weights_file) or os.path.isfile(safe_weights_file) or is_fsdp_ckpt: - weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {} + weights_only_kwarg = {"weights_only": True} # If the model is on the GPU, it still works! if is_sagemaker_mp_enabled(): if os.path.isfile(os.path.join(resume_from_checkpoint, "user_content.pt")): @@ -2899,7 +2898,7 @@ def _load_best_model(self): or os.path.exists(best_safe_adapter_model_path) ): has_been_loaded = True - weights_only_kwarg = {"weights_only": True} if is_torch_greater_or_equal_than_1_13 else {} + weights_only_kwarg = {"weights_only": True} if is_sagemaker_mp_enabled(): if os.path.isfile(os.path.join(self.state.best_model_checkpoint, "user_content.pt")): # If the 'user_content.pt' file exists, load with the new smp api. diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index 5f78860fe6c115..da95329e184567 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -56,12 +56,7 @@ import torch_xla.core.xla_model as xm if is_torch_available(): - from .pytorch_utils import is_torch_greater_or_equal_than_2_0 - - if is_torch_greater_or_equal_than_2_0: - from torch.optim.lr_scheduler import LRScheduler - else: - from torch.optim.lr_scheduler import _LRScheduler as LRScheduler + from torch.optim.lr_scheduler import LRScheduler logger = logging.get_logger(__name__) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 6b141cff39e1f7..6950e8e66d3ac1 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -71,8 +71,6 @@ import torch import torch.distributed as dist - from .pytorch_utils import is_torch_greater_or_equal_than_2_0 - if is_accelerate_available(): from accelerate.state import AcceleratorState, PartialState from accelerate.utils import DistributedType @@ -1157,7 +1155,7 @@ class TrainingArguments: }, ) dataloader_prefetch_factor: Optional[int] = field( - default=None if not is_torch_available() or is_torch_greater_or_equal_than_2_0 else 2, + default=None, metadata={ "help": ( "Number of batches loaded in advance by each worker. " @@ -1702,14 +1700,6 @@ def __post_init__(self): raise ValueError( "Your setup doesn't support bf16/gpu. You need torch>=1.10, using Ampere GPU with cuda>=11.0" ) - elif not is_torch_xpu_available(): - # xpu - from .pytorch_utils import is_torch_greater_or_equal_than_1_12 - - if not is_torch_greater_or_equal_than_1_12: - raise ValueError( - "Your setup doesn't support bf16/xpu. 
You need torch>=1.12, using Intel XPU/GPU with IPEX installed" - ) if self.fp16 and self.bf16: raise ValueError("At most one of fp16 and bf16 can be True, but not both") @@ -2056,11 +2046,7 @@ def __post_init__(self): if self.use_cpu: self.dataloader_pin_memory = False - if ( - (not is_torch_available() or is_torch_greater_or_equal_than_2_0) - and self.dataloader_num_workers == 0 - and self.dataloader_prefetch_factor is not None - ): + if self.dataloader_num_workers == 0 and self.dataloader_prefetch_factor is not None: raise ValueError( "--dataloader_prefetch_factor can only be set when data is loaded in a different process, i.e." " when --dataloader_num_workers > 1." diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 7fb647b253832e..2edfcdcd101c78 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -233,6 +233,7 @@ is_training_run_on_sagemaker, is_uroman_available, is_vision_available, + is_vptq_available, requires_backends, torch_only_method, ) diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 101b34182a7309..45fa3d9ca68c51 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -60,7 +60,6 @@ MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_MAPPING_NAMES, ) -from ..pytorch_utils import is_torch_greater_or_equal_than_2_0 from .import_utils import ( ENV_VARS_TRUE_VALUES, TORCH_FX_REQUIRED_VERSION, @@ -635,10 +634,9 @@ def to_concrete(t): operator.getitem: operator_getitem, } -if is_torch_greater_or_equal_than_2_0: - _MANUAL_META_OVERRIDES[torch.nn.functional.scaled_dot_product_attention] = ( - torch_nn_functional_scaled_dot_product_attention - ) +_MANUAL_META_OVERRIDES[torch.nn.functional.scaled_dot_product_attention] = ( + torch_nn_functional_scaled_dot_product_attention +) class HFProxy(Proxy): diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 92823a4ee016c3..cfc8b88fd81ed6 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -93,11 +93,13 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ GGUF_MIN_VERSION = "0.10.0" XLA_FSDPV2_MIN_VERSION = "2.2.0" HQQ_MIN_VERSION = "0.2.1" +VPTQ_MIN_VERSION = "0.0.4" _accelerate_available, _accelerate_version = _is_package_available("accelerate", return_version=True) _apex_available = _is_package_available("apex") _aqlm_available = _is_package_available("aqlm") +_vptq_available, _vptq_version = _is_package_available("vptq", return_version=True) _av_available = importlib.util.find_spec("av") is not None _bitsandbytes_available = _is_package_available("bitsandbytes") _eetq_available = _is_package_available("eetq") @@ -816,6 +818,10 @@ def is_aqlm_available(): return _aqlm_available +def is_vptq_available(min_version: str = VPTQ_MIN_VERSION): + return _vptq_available and version.parse(_vptq_version) >= version.parse(min_version) + + def is_av_available(): return _av_available diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 253cc4a0621080..44e47e4f6e65c2 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -39,6 +39,7 @@ class QuantizationMethod(str, Enum): GPTQ = "gptq" AWQ = "awq" AQLM = "aqlm" + VPTQ = "vptq" QUANTO = "quanto" EETQ = "eetq" HQQ = "hqq" @@ -994,6 +995,102 @@ def post_init(self): self.linear_weights_not_to_quantize = [] +@dataclass +class 
VptqLayerConfig(QuantizationConfigMixin):
+    """
+    This is used to describe the VPTQ quantization parameters for each layer.
+
+    Args:
+        enable_norm (`bool`, *optional*, defaults to `True`): whether to use a scale/bias for the fp weights
+        enable_perm (`bool`, *optional*, defaults to `True`): whether to permute the input channels
+        group_num (`int`, *optional*, defaults to `1`): number of groups used for vector quantization
+        group_size (`int`, *optional*, defaults to `-1`): group size; depends on the output features
+        in_features (`int`, *optional*, defaults to `-1`): input dimension of the layer
+        indices_as_float (`bool`, *optional*, defaults to `False`): whether indices are stored as floats (used for fine-tuning)
+        is_indice_packed (`bool`, *optional*, defaults to `True`): whether indices are packed; should always be `True`
+        num_centroids (`list`, *optional*, defaults to `[-1, -1]`): number of centroids per cluster
+        num_res_centroids (`list`, *optional*, defaults to `[-1, -1]`): number of residual centroids per cluster
+        out_features (`int`, *optional*, defaults to `-1`): output dimension of the layer
+        outlier_size (`int`, *optional*, defaults to `0`): number of outliers
+        vector_lens (`list`, *optional*, defaults to `[-1, -1]`): length of the centroid vectors used in quantization
+    """
+
+    def __init__(
+        self,
+        enable_norm: bool = True,
+        enable_perm: bool = True,
+        group_num: int = 1,
+        group_size: int = -1,
+        in_features: int = -1,
+        indices_as_float: bool = False,
+        is_indice_packed: bool = True,
+        num_centroids: tuple = [-1, -1],
+        num_res_centroids: tuple = [-1, -1],
+        out_features: int = -1,
+        outlier_size: int = 0,
+        vector_lens: tuple = [-1, -1],
+        **kwargs,
+    ):
+        self.enable_norm = enable_norm
+        self.enable_perm = enable_perm
+        self.group_num = group_num
+        self.group_size = group_size
+        self.in_features = in_features
+        self.indices_as_float = indices_as_float
+        self.is_indice_packed = is_indice_packed
+        self.num_centroids = num_centroids
+        self.num_res_centroids = num_res_centroids
+        self.out_features = out_features
+        self.outlier_size = outlier_size
+        self.vector_lens = vector_lens
+        self.post_init()
+
+    def post_init(self):
+        r"""
+        Safety checker that the arguments are correct.
+        """
+        if self.is_indice_packed is False:
+            raise ValueError("is_indice_packed should always be True")
+
+
+@dataclass
+class VptqConfig(QuantizationConfigMixin):
+    """
+    This is a wrapper class for `vptq` quantization parameters.
+
+    Args:
+        enable_proxy_error (`bool`, *optional*, defaults to `False`): whether to calculate the proxy error for each layer
+        config_for_layers (`Dict`, *optional*, defaults to `{}`): quantization parameters for each layer
+        shared_layer_config (`Dict`, *optional*, defaults to `{}`): quantization parameters shared among layers
+        modules_to_not_convert (`list`, *optional*, defaults to `None`):
+            The list of modules to not quantize, useful for quantizing models that explicitly require to have
+            some modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers).
+        kwargs (`Dict[str, Any]`, *optional*):
+            Additional parameters from which to initialize the configuration object.
+ """ + + def __init__( + self, + enable_proxy_error: bool = False, + config_for_layers: Dict[str, Any] = {}, + shared_layer_config: Dict[str, Any] = {}, + modules_to_not_convert: Optional[List] = None, + **kwargs, + ): + self.quant_method = QuantizationMethod.VPTQ + self.enable_proxy_error = enable_proxy_error + self.config_for_layers: Dict[str, Any] = config_for_layers + self.shared_layer_config: Dict[str, Any] = shared_layer_config + self.modules_to_not_convert = modules_to_not_convert + self.post_init() + + def post_init(self): + r""" + Safety checker that arguments are correct + """ + for layer_name, layer_param in self.config_for_layers.items(): + VptqLayerConfig(**layer_param) + if self.enable_proxy_error is True: + raise ValueError("enable_proxy_error should always be False until we support training") + + @dataclass class QuantoConfig(QuantizationConfigMixin): """ diff --git a/tests/generation/test_streamers.py b/tests/generation/test_streamers.py index c82a5e99e0ded0..be8c37334d02fc 100644 --- a/tests/generation/test_streamers.py +++ b/tests/generation/test_streamers.py @@ -17,7 +17,15 @@ from queue import Empty from threading import Thread -from transformers import AutoTokenizer, TextIteratorStreamer, TextStreamer, is_torch_available +import pytest + +from transformers import ( + AsyncTextIteratorStreamer, + AutoTokenizer, + TextIteratorStreamer, + TextStreamer, + is_torch_available, +) from transformers.testing_utils import CaptureStdout, require_torch, torch_device from ..test_modeling_common import ids_tensor @@ -120,3 +128,43 @@ def test_iterator_streamer_timeout(self): streamer_text = "" for new_text in streamer: streamer_text += new_text + + +@require_torch +@pytest.mark.asyncio(loop_scope="class") +class AsyncStreamerTester(unittest.IsolatedAsyncioTestCase): + async def test_async_iterator_streamer_matches_non_streaming(self): + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + model.config.eos_token_id = -1 + + input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device) + greedy_ids = model.generate(input_ids, max_new_tokens=10, do_sample=False) + greedy_text = tokenizer.decode(greedy_ids[0]) + + streamer = AsyncTextIteratorStreamer(tokenizer) + generation_kwargs = {"input_ids": input_ids, "max_new_tokens": 10, "do_sample": False, "streamer": streamer} + thread = Thread(target=model.generate, kwargs=generation_kwargs) + thread.start() + streamer_text = "" + async for new_text in streamer: + streamer_text += new_text + + self.assertEqual(streamer_text, greedy_text) + + async def test_async_iterator_streamer_timeout(self): + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + model.config.eos_token_id = -1 + + input_ids = ids_tensor((1, 5), vocab_size=model.config.vocab_size).to(torch_device) + streamer = AsyncTextIteratorStreamer(tokenizer, timeout=0.001) + generation_kwargs = {"input_ids": input_ids, "max_new_tokens": 10, "do_sample": False, "streamer": streamer} + thread = Thread(target=model.generate, kwargs=generation_kwargs) + thread.start() + + # The streamer will timeout after 0.001 seconds, so TimeoutError will be raised + with self.assertRaises(TimeoutError): + streamer_text = "" + async for new_text in streamer: + streamer_text += new_text diff --git 
a/tests/generation/test_utils.py b/tests/generation/test_utils.py index e85f2663624740..4ac22e77779022 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -37,6 +37,9 @@ require_torch_multi_accelerator, require_torch_multi_gpu, require_torch_sdpa, + set_config_for_less_flaky_test, + set_model_for_less_flaky_test, + set_model_tester_for_less_flaky_test, slow, torch_device, ) @@ -1921,11 +1924,13 @@ def test_generate_with_static_cache(self): Tests that generating with static cache give almost same results as with dynamic cache, and the output cache has the expected shapes """ + set_model_tester_for_less_flaky_test(self) for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: self.skipTest(reason="This model does not support the static cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() + set_config_for_less_flaky_test(config) main_input = inputs_dict[model_class.main_input_name] if config.is_encoder_decoder: @@ -1938,6 +1943,8 @@ def test_generate_with_static_cache(self): for dtype in (torch.float32, torch.float16): model = model_class(config).to(torch_device).to(dtype).eval() + set_model_for_less_flaky_test(model) + generation_kwargs = { "max_new_tokens": max_new_tokens, "return_dict_in_generate": True, # Required to return `past_key_values` diff --git a/tests/models/aria/test_modeling_aria.py b/tests/models/aria/test_modeling_aria.py index d3458530ac349e..b6f1da56c6782e 100644 --- a/tests/models/aria/test_modeling_aria.py +++ b/tests/models/aria/test_modeling_aria.py @@ -45,8 +45,7 @@ if is_torch_available(): import torch -else: - is_torch_greater_or_equal_than_2_0 = False + if is_vision_available(): from PIL import Image diff --git a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py index 893132f4337dd4..f02e8f167636eb 100644 --- a/tests/models/falcon_mamba/test_modeling_falcon_mamba.py +++ b/tests/models/falcon_mamba/test_modeling_falcon_mamba.py @@ -43,9 +43,6 @@ FalconMambaModel, ) from transformers.cache_utils import MambaCache - from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_0 -else: - is_torch_greater_or_equal_than_2_0 = False # Copied from transformers.tests.models.mamba.MambaModelTester with Mamba->FalconMamba,mamba->falcon_mamba @@ -246,9 +243,6 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict -@unittest.skipIf( - not is_torch_greater_or_equal_than_2_0, reason="See https://github.com/huggingface/transformers/pull/24204" -) @require_torch # Copied from transformers.tests.models.mamba.MambaModelTest with Mamba->Falcon,mamba->falcon_mamba,FalconMambaCache->MambaCache class FalconMambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py index 1db484c4062c35..281594492500b0 100644 --- a/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py +++ b/tests/models/gpt_bigcode/test_modeling_gpt_bigcode.py @@ -37,9 +37,6 @@ GPTBigCodeModel, ) from transformers.models.gpt_bigcode.modeling_gpt_bigcode import GPTBigCodeAttention - from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_12 -else: - is_torch_greater_or_equal_than_1_12 = False class GPTBigCodeModelTester: @@ -504,10 +501,6 @@ class GPTBigCodeMHAModelTest(GPTBigCodeModelTest): multi_query = False -@unittest.skipIf( - not 
is_torch_greater_or_equal_than_1_12, - reason="`GPTBigCode` checkpoints use `PytorchGELUTanh` which requires `torch>=1.12.0`.", -) @slow @require_torch class GPTBigCodeModelLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py index afc741cd502dec..50840bbcfaa6dc 100644 --- a/tests/models/gptj/test_modeling_gptj.py +++ b/tests/models/gptj/test_modeling_gptj.py @@ -41,9 +41,6 @@ GPTJForSequenceClassification, GPTJModel, ) - from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_12 -else: - is_torch_greater_or_equal_than_1_12 = False class GPTJModelTester: @@ -363,15 +360,9 @@ class GPTJModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin test_model_parallel = False test_head_masking = False - @unittest.skipIf( - not is_torch_greater_or_equal_than_1_12, reason="PR #22069 made changes that require torch v1.12+." - ) def test_torch_fx(self): super().test_torch_fx() - @unittest.skipIf( - not is_torch_greater_or_equal_than_1_12, reason="PR #22069 made changes that require torch v1.12+." - ) def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 12004cc3c8ad89..94229b13d2cbfe 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -44,9 +44,6 @@ from transformers import IdeficsForVisionText2Text, IdeficsModel, IdeficsProcessor from transformers.models.idefics.configuration_idefics import IdeficsPerceiverConfig, IdeficsVisionConfig - from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_0 -else: - is_torch_greater_or_equal_than_2_0 = False if is_vision_available(): from PIL import Image @@ -327,7 +324,6 @@ def test_eager_matches_sdpa_generate(self): self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") -@unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @require_torch class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else () @@ -594,7 +590,6 @@ def test_sdpa_can_dispatch_non_composite_models(self): pass -@unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @require_torch class IdeficsForVisionText2TextTest(IdeficsModelTest, GenerationTesterMixin, unittest.TestCase): all_model_classes = (IdeficsForVisionText2Text,) if is_torch_available() else () @@ -818,7 +813,6 @@ def test_sdpa_can_dispatch_non_composite_models(self): pass -@unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required") @require_torch @require_vision class IdeficsModelIntegrationTest(TestCasePlus): diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 83e125c07c15bc..974628c8b4324f 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -48,8 +48,6 @@ if is_torch_available(): import torch -else: - is_torch_greater_or_equal_than_2_0 = False if is_vision_available(): from PIL import Image diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 5bfd4c3f3c0e83..c25fa1180649fa 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ 
b/tests/models/idefics3/test_modeling_idefics3.py @@ -40,8 +40,6 @@ Idefics3ForConditionalGeneration, Idefics3Model, ) -else: - is_torch_greater_or_equal_than_2_0 = False if is_vision_available(): from PIL import Image diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 3d08ab35e0f630..b4a959a00d2a0c 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -43,8 +43,7 @@ if is_torch_available(): import torch -else: - is_torch_greater_or_equal_than_2_0 = False + if is_vision_available(): from PIL import Image diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index c258ce96b94e48..14b0fb8cc07db7 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -48,8 +48,7 @@ import torch from transformers.models.llava_next.modeling_llava_next import image_size_to_num_patches -else: - is_torch_greater_or_equal_than_2_0 = False + if is_vision_available(): from PIL import Image diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index a6fb341ff9bf56..c431f91bf5102f 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -48,8 +48,6 @@ if is_torch_available(): import torch -else: - is_torch_greater_or_equal_than_2_0 = False if is_vision_available(): from PIL import Image diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index a217eee2c70671..6965d2033ec730 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -48,8 +48,6 @@ if is_torch_available(): import torch -else: - is_torch_greater_or_equal_than_2_0 = False if is_vision_available(): from PIL import Image diff --git a/tests/models/mamba/test_modeling_mamba.py b/tests/models/mamba/test_modeling_mamba.py index d432dfa93df487..455022140f7c5b 100644 --- a/tests/models/mamba/test_modeling_mamba.py +++ b/tests/models/mamba/test_modeling_mamba.py @@ -38,9 +38,6 @@ MambaModel, ) from transformers.models.mamba.modeling_mamba import MambaCache - from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_0 -else: - is_torch_greater_or_equal_than_2_0 = False class MambaModelTester: @@ -239,9 +236,6 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict -@unittest.skipIf( - not is_torch_greater_or_equal_than_2_0, reason="See https://github.com/huggingface/transformers/pull/24204" -) @require_torch class MambaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (MambaModel, MambaForCausalLM) if is_torch_available() else () diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index 9b3a9563b58ddc..17cbdc1e8d51dd 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -21,6 +21,7 @@ from transformers import AutoTokenizer, Mamba2Config, is_torch_available from transformers.testing_utils import require_read_token, require_torch, require_torch_gpu, slow, torch_device +from transformers.utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available from 
...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -36,9 +37,6 @@ Mamba2Model, ) from transformers.models.mamba2.modeling_mamba2 import Mamba2Cache, Mamba2Mixer - from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_0 -else: - is_torch_greater_or_equal_than_2_0 = False class Mamba2ModelTester: @@ -103,6 +101,10 @@ def prepare_config_and_inputs( ): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + # Only left padding is valid + attention_mask = torch.ones(size=(self.batch_size, self.seq_length), device=input_ids.device, dtype=torch.long) + attention_mask[0, :1] = 0 + sequence_labels = None token_labels = None choice_labels = None @@ -118,7 +120,7 @@ def prepare_config_and_inputs( return ( config, input_ids, - None, + attention_mask, sequence_labels, token_labels, choice_labels, @@ -158,10 +160,57 @@ def prepare_config_and_inputs_for_common(self): inputs_dict = {"input_ids": input_ids} return config, inputs_dict + def create_and_check_mamba2_caching(self, config, input_ids, attention_mask, *args): + model = Mamba2Model(config=config) + model.to(torch_device) + model.eval() + + output_whole = model(input_ids, attention_mask=attention_mask).last_hidden_state + + outputs = model( + input_ids[:, :-1], + attention_mask=attention_mask[:, :-1], + use_cache=True, + cache_position=torch.arange(0, config.conv_kernel, device=input_ids.device), + ) + output_one = outputs.last_hidden_state + + # Using the state computed on the first inputs, we will get the same output + outputs = model( + input_ids[:, -1:], + attention_mask=attention_mask[:, -1:], + use_cache=True, + cache_params=outputs.cache_params, + cache_position=torch.arange(config.conv_kernel, config.conv_kernel + 1, device=input_ids.device), + ) + output_two = outputs.last_hidden_state + + self.parent.assertTrue( + torch.allclose(torch.cat([output_one, output_two], dim=1), output_whole, atol=1e-3, rtol=1e-3) + ) + + def create_and_check_mamba2_slow_vs_fast_forward(self, config, input_ids, *args, gradient_checkpointing=False): + model = Mamba2Model(config) + model.eval() + + if not (is_mamba_2_ssm_available() and is_causal_conv1d_available()): + self.parent.skipTest( + "This test needs the Mamba2 fast path. Skipping as the necessary packages have not been found." + ) + if torch_device != "cuda": + self.parent.skipTest("This test needs the Mamba2 fast path. 
Skipping as we need a cuda capable device.") + + model.to(torch_device) + if gradient_checkpointing: + model.gradient_checkpointing_enable() + + token_emb = model.embeddings(input_ids) + outputs_fast = model.layers[0].mixer.cuda_kernels_forward(token_emb) + outputs_slow = model.layers[0].mixer.torch_forward(token_emb) + + self.parent.assertTrue(torch.allclose(outputs_fast, outputs_slow, atol=1e-3, rtol=1e-3)) + -@unittest.skipIf( - not is_torch_greater_or_equal_than_2_0, reason="See https://github.com/huggingface/transformers/pull/24204" -) @require_torch class Mamba2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (Mamba2Model, Mamba2ForCausalLM) if is_torch_available() else () @@ -184,6 +233,14 @@ def setUp(self): self, config_class=Mamba2Config, n_embd=37, common_properties=["hidden_size", "num_hidden_layers"] ) + def test_mamba2_caching(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mamba2_caching(*config_and_inputs) + + def test_mamba2_slow_vs_fast_forward(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_mamba2_slow_vs_fast_forward(*config_and_inputs) + def test_initialization(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() @@ -199,23 +256,6 @@ def test_initialization(self): def test_tied_weights_keys(self): pass - @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") - def test_generate_without_input_ids(self): - pass - - @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") - @parameterized.expand([("greedy", 1), ("beam search", 2)]) - def test_generate_from_inputs_embeds(self, _, num_beams): - pass - - @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") - def test_greedy_generate_dict_outputs_use_cache(self): - pass - - @unittest.skip(reason="To fix, Mamba 2 cache slicing is interacting with beam search") - def test_beam_search_generate_dict_outputs_use_cache(self): - pass - @unittest.skip(reason="A large mamba2 would be necessary (and costly) for that") def test_multi_gpu_data_parallel_forward(self): pass diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index bc8baa2746adde..98b554be65fbf9 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -41,6 +41,9 @@ require_torch_gpu, require_torch_sdpa, require_torchaudio, + set_config_for_less_flaky_test, + set_model_for_less_flaky_test, + set_model_tester_for_less_flaky_test, slow, torch_device, ) @@ -516,8 +519,11 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): def get_mean_reldiff(failcase, x, ref, atol, rtol): return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + set_model_tester_for_less_flaky_test(self) + for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + set_config_for_less_flaky_test(config) model = model_class(config) is_encoder_decoder = model.config.is_encoder_decoder @@ -534,6 +540,9 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) + set_model_for_less_flaky_test(model_eager) + 
set_model_for_less_flaky_test(model_sdpa) + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, # but it would be nicer to have an efficient way to use parameterized.expand fail_cases = [] @@ -1528,8 +1537,11 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): def get_mean_reldiff(failcase, x, ref, atol, rtol): return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + set_model_tester_for_less_flaky_test(self) + for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + set_config_for_less_flaky_test(config) model = model_class(config) is_encoder_decoder = model.config.is_encoder_decoder @@ -1546,6 +1558,9 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) + set_model_for_less_flaky_test(model_eager) + set_model_for_less_flaky_test(model_sdpa) + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, # but it would be nicer to have an efficient way to use parameterized.expand fail_cases = [] diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index 5ffea7ffe55087..f973e1211dc081 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -40,8 +40,7 @@ if is_torch_available(): import torch -else: - is_torch_greater_or_equal_than_2_0 = False + if is_vision_available(): from PIL import Image diff --git a/tests/models/pixtral/test_modeling_pixtral.py b/tests/models/pixtral/test_modeling_pixtral.py index 0c36cb5a4e0554..3e5667caf45e3e 100644 --- a/tests/models/pixtral/test_modeling_pixtral.py +++ b/tests/models/pixtral/test_modeling_pixtral.py @@ -33,8 +33,7 @@ if is_torch_available(): import torch -else: - is_torch_greater_or_equal_than_2_0 = False + if is_vision_available(): pass diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index 4806ec2c72d339..8974d6923b391c 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -41,8 +41,6 @@ if is_torch_available(): import torch -else: - is_torch_greater_or_equal_than_2_0 = False class Qwen2AudioModelTester: diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 93ed33ae774458..2c27e1a03a647c 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -47,8 +47,6 @@ if is_torch_available(): import torch -else: - is_torch_greater_or_equal_than_2_0 = False if is_vision_available(): from PIL import Image diff --git a/tests/models/rwkv/test_modeling_rwkv.py b/tests/models/rwkv/test_modeling_rwkv.py index 5e82956e3efa6c..0bc5c2de070135 100644 --- a/tests/models/rwkv/test_modeling_rwkv.py +++ b/tests/models/rwkv/test_modeling_rwkv.py @@ -33,9 +33,6 @@ RwkvForCausalLM, RwkvModel, ) - from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_0 -else: - is_torch_greater_or_equal_than_2_0 = False class RwkvModelTester: @@ -231,9 +228,6 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict -@unittest.skipIf( - not is_torch_greater_or_equal_than_2_0, reason="See 
https://github.com/huggingface/transformers/pull/24204" -) @require_torch class RwkvModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (RwkvModel, RwkvForCausalLM) if is_torch_available() else () @@ -440,9 +434,6 @@ def test_left_padding_compatibility(self): pass -@unittest.skipIf( - not is_torch_greater_or_equal_than_2_0, reason="See https://github.com/huggingface/transformers/pull/24204" -) @slow class RWKVIntegrationTests(unittest.TestCase): def setUp(self): diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py index 15f1219556cd0f..276375c7e85439 100644 --- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py @@ -840,7 +840,13 @@ def test_generation_languages(self): def test_speech_generation(self): config, input_speech, input_text = self.prepare_speech_and_text_input() + from transformers.testing_utils import set_config_for_less_flaky_test, set_model_for_less_flaky_test + + set_config_for_less_flaky_test(config) + model = SeamlessM4Tv2Model(config=config) + set_model_for_less_flaky_test(model) + self.update_generation(model) model.save_pretrained(self.tmpdirname) model.to(torch_device) @@ -852,6 +858,11 @@ def test_speech_generation(self): state_dict = model.state_dict() text_model = SeamlessM4Tv2ForTextToSpeech.from_pretrained(self.tmpdirname) + # Even if this component is loaded after `model.save_pretrained` which is after + # `set_model_for_less_flaky_test(model)`, we still need to apply `set_model_for_less_flaky_test` here as the + # `eps` attribute in the model's norm layers is not set from the config. + set_model_for_less_flaky_test(text_model) + self.update_generation(text_model) text_model.to(torch_device) text_model.eval() @@ -859,6 +870,11 @@ def test_speech_generation(self): output_text = self.factory_generation_speech_test(model, input_text) speech_model = SeamlessM4Tv2ForSpeechToSpeech.from_pretrained(self.tmpdirname) + # Even if this component is loaded after `model.save_pretrained` which is after + # `set_model_for_less_flaky_test(model)`, we still need to apply `set_model_for_less_flaky_test` here as the + # `eps` attribute in the model's norm layers is not set from the config. 
+ set_model_for_less_flaky_test(speech_model) + self.update_generation(speech_model) speech_model.to(torch_device) speech_model.eval() diff --git a/tests/models/tapas/test_modeling_tapas.py b/tests/models/tapas/test_modeling_tapas.py index 4ee159d6bddd1d..05618f4a4efd8c 100644 --- a/tests/models/tapas/test_modeling_tapas.py +++ b/tests/models/tapas/test_modeling_tapas.py @@ -60,9 +60,6 @@ reduce_mean, reduce_sum, ) - from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_12 -else: - is_torch_greater_or_equal_than_1_12 = False class TapasModelTester: @@ -411,7 +408,6 @@ def prepare_config_and_inputs_for_common(self): return config, inputs_dict -@unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch class TapasModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = ( @@ -578,7 +574,6 @@ def prepare_tapas_batch_inputs_for_training(): return table, queries, answer_coordinates, answer_text, float_answer -@unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch class TapasModelIntegrationTest(unittest.TestCase): @cached_property @@ -930,10 +925,6 @@ def test_inference_classification_head(self): self.assertTrue(torch.allclose(outputs.logits, expected_tensor, atol=0.05)) -# Below: tests for Tapas utilities which are defined in modeling_tapas.py. -# These are based on segmented_tensor_test.py of the original implementation. -# URL: https://github.com/google-research/tapas/blob/master/tapas/models/segmented_tensor_test.py -@unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch class TapasUtilitiesTest(unittest.TestCase): def _prepare_tables(self): diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index 0a911f7182b4a0..9a3a2578fd16b3 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -23,7 +23,7 @@ import pandas as pd from parameterized import parameterized -from transformers import AddedToken, is_torch_available +from transformers import AddedToken from transformers.models.tapas.tokenization_tapas import ( VOCAB_FILES_NAMES, BasicTokenizer, @@ -45,12 +45,6 @@ from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english, merge_model_tokenizer_mappings -if is_torch_available(): - from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_12 -else: - is_torch_greater_or_equal_than_1_12 = False - - @require_tokenizers @require_pandas class TapasTokenizationTest(TokenizerTesterMixin, unittest.TestCase): @@ -1048,7 +1042,6 @@ def test_token_type_ids(self): # Do the same test as modeling common. 
self.assertIn(0, output["token_type_ids"][0]) - @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch @slow def test_torch_encode_plus_sent_to_model(self): diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index 4f501fc10a028f..8286b3c94fb9da 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -41,8 +41,6 @@ if is_torch_available(): import torch -else: - is_torch_greater_or_equal_than_2_0 = False if is_vision_available(): from PIL import Image diff --git a/tests/pipelines/test_pipelines_table_question_answering.py b/tests/pipelines/test_pipelines_table_question_answering.py index 9481ab200063f8..e2141dc7cc2f66 100644 --- a/tests/pipelines/test_pipelines_table_question_answering.py +++ b/tests/pipelines/test_pipelines_table_question_answering.py @@ -20,7 +20,6 @@ AutoTokenizer, TableQuestionAnsweringPipeline, TFAutoModelForTableQuestionAnswering, - is_torch_available, pipeline, ) from transformers.testing_utils import ( @@ -33,12 +32,6 @@ ) -if is_torch_available(): - from transformers.pytorch_utils import is_torch_greater_or_equal_than_1_12 -else: - is_torch_greater_or_equal_than_1_12 = False - - @is_pipeline_test class TQAPipelineTests(unittest.TestCase): # Putting it there for consistency, but TQA do not have fast tokenizer @@ -150,7 +143,6 @@ def test_small_model_tf(self): }, ) - @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch def test_small_model_pt(self, torch_dtype="float32"): model_id = "lysandre/tiny-tapas-random-wtq" @@ -253,12 +245,10 @@ def test_small_model_pt(self, torch_dtype="float32"): }, ) - @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch def test_small_model_pt_fp16(self): self.test_small_model_pt(torch_dtype="float16") - @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch def test_slow_tokenizer_sqa_pt(self, torch_dtype="float32"): model_id = "lysandre/tiny-tapas-random-sqa" @@ -378,7 +368,6 @@ def test_slow_tokenizer_sqa_pt(self, torch_dtype="float32"): }, ) - @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @require_torch def test_slow_tokenizer_sqa_pt_fp16(self): self.test_slow_tokenizer_sqa_pt(torch_dtype="float16") @@ -505,7 +494,6 @@ def test_slow_tokenizer_sqa_tf(self): }, ) - @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @slow @require_torch def test_integration_wtq_pt(self, torch_dtype="float32"): @@ -551,7 +539,6 @@ def test_integration_wtq_pt(self, torch_dtype="float32"): ] self.assertListEqual(results, expected_results) - @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @slow @require_torch def test_integration_wtq_pt_fp16(self): @@ -606,7 +593,6 @@ def test_integration_wtq_tf(self): ] self.assertListEqual(results, expected_results) - @unittest.skipIf(not is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @slow @require_torch def test_integration_sqa_pt(self, torch_dtype="float32"): @@ -632,7 +618,6 @@ def test_integration_sqa_pt(self, torch_dtype="float32"): ] self.assertListEqual(results, expected_results) - @unittest.skipIf(not 
is_torch_greater_or_equal_than_1_12, reason="Tapas is only available in torch v1.12+") @slow @require_torch def test_integration_sqa_pt_fp16(self): diff --git a/tests/quantization/vptq_integration/__init__.py b/tests/quantization/vptq_integration/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/quantization/vptq_integration/test_vptq.py b/tests/quantization/vptq_integration/test_vptq.py new file mode 100644 index 00000000000000..faa9a5879d1dcc --- /dev/null +++ b/tests/quantization/vptq_integration/test_vptq.py @@ -0,0 +1,194 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import tempfile +import unittest + +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, VptqConfig +from transformers.testing_utils import ( + require_accelerate, + require_torch_gpu, + require_torch_multi_gpu, + require_vptq, + slow, + torch_device, +) +from transformers.utils import is_accelerate_available, is_torch_available + + +if is_torch_available(): + import torch + +if is_accelerate_available(): + from accelerate import init_empty_weights + + +class VptqConfigTest(unittest.TestCase): + def test_to_dict(self): + """ + Makes sure the config format is properly set + """ + quantization_config = VptqConfig() + vptq_orig_config = quantization_config.to_dict() + + self.assertEqual(quantization_config.quant_config, vptq_orig_config["quant_config"]) + + +@slow +@require_torch_gpu +@require_vptq +@require_accelerate +class VptqTest(unittest.TestCase): + model_name = "VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft" + + input_text = "Hello my name is" + max_new_tokens = 32 + + EXPECTED_OUTPUT = "Hello my name is Sarah and I am a 25 year old woman from the United States. 
I am a college graduate and I am currently working as a marketing specialist for a small" + + device_map = "cuda" + + # called only once for all test in this class + @classmethod + def setUpClass(cls): + """ + Setup quantized model + """ + cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name) + cls.quantized_model = AutoModelForCausalLM.from_pretrained( + cls.model_name, + device_map=cls.device_map, + ) + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + gc.collect() + + def test_quantized_model(self): + """ + Simple test that checks if the quantized model is working properly + """ + input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + + output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False) + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + + def test_raise_if_non_quantized(self): + model_id = "facebook/opt-125m" + quantization_config = VptqConfig() + + with self.assertRaises(ValueError): + _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config) + + def test_save_pretrained(self): + """ + Simple test that checks if the quantized model is working properly after being saved and loaded + """ + with tempfile.TemporaryDirectory() as tmpdirname: + self.quantized_model.save_pretrained(tmpdirname) + model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map) + + input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + + output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False) + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + + @require_torch_multi_gpu + def test_quantized_model_multi_gpu(self): + """ + Simple test that checks if the quantized model is working properly with multiple GPUs + """ + input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + + quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto") + + self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1}) + + output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens, do_sample=False) + + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + + def test_quantized_model_conversion(self): + """ + Simple test that checks if the quantized model has been converted properly + """ + from vptq import VQuantLinear + + from transformers.integrations import replace_with_vptq_linear + + model_id = "facebook/opt-350m" + config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5") + modules_to_not_convert = ["lm_head"] + names = [ + "q_proj", + "k_proj", + "v_proj", + "out_proj", + "fc1", + "fc2", + ] + value = { + "enable_norm": True, + "enable_perm": True, + "group_num": 1, + "group_size": 128, + "indices_as_float": False, + "num_centroids": [-1, 128], + "num_res_centroids": [-1, 128], + "outlier_size": 0, + "vector_lens": [-1, 12], + } + shared_layer_config = {} + for name in names: + shared_layer_config[name] = value + for i in range(24): + modules_to_not_convert.append("model.decoder.layers.{layer_idx}.fc1".format(layer_idx=i)) + layer_configs = {} + layer_configs["model.decoder.project_out"] = value + layer_configs["model.decoder.project_in"] = value + quantization_config = VptqConfig(config_for_layers=layer_configs, 
shared_layer_config=shared_layer_config) + + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + + nb_linears = 0 + for module in model.modules(): + if isinstance(module, torch.nn.Linear): + nb_linears += 1 + + model, _ = replace_with_vptq_linear(model, quantization_config=quantization_config) + nb_vptq_linear = 0 + for module in model.modules(): + if isinstance(module, VQuantLinear): + nb_vptq_linear += 1 + + self.assertEqual(nb_linears - 1, nb_vptq_linear) + + # Try with `linear_weights_not_to_quantize` + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + quantization_config = VptqConfig(config_for_layers=layer_configs, shared_layer_config=shared_layer_config) + model, _ = replace_with_vptq_linear( + model, quantization_config=quantization_config, modules_to_not_convert=modules_to_not_convert + ) + nb_vptq_linear = 0 + for module in model.modules(): + if isinstance(module, VQuantLinear): + nb_vptq_linear += 1 + # 25 comes from 24 decoder.layers.{layer_idx}.fc1 + # and the last lm_head + self.assertEqual(nb_linears - 25, nb_vptq_linear) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index f150477c6231f4..929bbb13a56e80 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -89,6 +89,9 @@ require_torch_multi_accelerator, require_torch_multi_gpu, require_torch_sdpa, + set_config_for_less_flaky_test, + set_model_for_less_flaky_test, + set_model_tester_for_less_flaky_test, slow, torch_device, ) @@ -3976,34 +3979,11 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): def get_mean_reldiff(failcase, x, ref, atol, rtol): return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" - if hasattr(self.model_tester, "num_hidden_layers"): - self.model_tester.num_hidden_layers = 1 - if hasattr(self.model_tester, "vision_config") and "num_hidden_layers" in self.model_tester.vision_config: - self.model_tester.vision_config = copy.deepcopy(self.model_tester.vision_config) - self.model_tester.vision_config["num_hidden_layers"] = 1 - if hasattr(self.model_tester, "text_config") and "num_hidden_layers" in self.model_tester.text_config: - self.model_tester.text_config = copy.deepcopy(self.model_tester.text_config) - self.model_tester.text_config["num_hidden_layers"] = 1 + set_model_tester_for_less_flaky_test(self) for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - config.rms_norm_eps = 1.0 - config.layer_norm_eps = 1.0 - config.norm_eps = 1.0 - config.norm_epsilon = 1.0 - config.layer_norm_epsilon = 1.0 - - # norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance. - # (We don't need the original epsilon values to check eager/sdpa matches) - for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]: - if hasattr(config, attr): - getattr(config, attr).rms_norm_eps = 1.0 - getattr(config, attr).layer_norm_eps = 1.0 - getattr(config, attr).norm_eps = 1.0 - getattr(config, attr).norm_epsilon = 1.0 - getattr(config, attr).layer_norm_epsilon = 1.0 - + set_config_for_less_flaky_test(config) model = model_class(config) # FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors. # These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask. 
@@ -4029,13 +4009,8 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device, dtype=torch_dtype) - # Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.) - for x in model_eager.modules(): - if isinstance(x, (nn.LayerNorm, nn.GroupNorm)): - x.eps = 1.0 - for x in model_sdpa.modules(): - if isinstance(x, (nn.LayerNorm, nn.GroupNorm)): - x.eps = 1.0 + set_model_for_less_flaky_test(model_eager) + set_model_for_less_flaky_test(model_sdpa) # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model, # but it would be nicer to have an efficient way to use parameterized.expand diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 906e85e1de61a5..c641ccb21e2984 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -995,9 +995,7 @@ def _print_list(l) -> str: def infer_tests_to_run( - output_file: str, - diff_with_last_commit: bool = False, - filter_models: bool = False, + output_file: str, diff_with_last_commit: bool = False, filter_models: bool = False, test_all: bool = False ): """ The main function called by the test fetcher. Determines the tests to run from the diff. @@ -1018,7 +1016,11 @@ def infer_tests_to_run( Whether or not to filter the tests to core models only, when a file modified results in a lot of model tests. """ - modified_files = get_modified_python_files(diff_with_last_commit=diff_with_last_commit) + if not test_all: + modified_files = get_modified_python_files(diff_with_last_commit=diff_with_last_commit) + else: + modified_files = [str(k) for k in PATH_TO_TESTS.glob("*/*") if str(k).endswith(".py") and "test_" in str(k)] + print("\n### test_all is TRUE, FETCHING ALL FILES###\n") print(f"\n### MODIFIED FILES ###\n{_print_list(modified_files)}") # Create the map that will give us all impacted modules. @@ -1230,5 +1232,6 @@ def create_test_list_from_filter(full_test_list, out_path): args.output_file, diff_with_last_commit=diff_with_last_commit, filter_models=False, + test_all=commit_flags["test_all"], ) filter_tests(args.output_file, ["repo_utils"])
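For reference, a minimal usage sketch of the VPTQ integration added in this diff, assuming the `vptq` (>= 0.0.4) and `accelerate` packages are installed and a CUDA device is available. The checkpoint id below is the one used in tests/quantization/vptq_integration/test_vptq.py; any other prequantized VPTQ checkpoint should load the same way:

# The checkpoint embeds its quantization_config, so from_pretrained dispatches to VptqHfQuantizer automatically.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda")

inputs = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=32, do_sample=False)
print(tokenizer.decode(output[0], skip_special_tokens=True))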