diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 7ccf5ec96cec4f..84c0f65166baef 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -133,7 +133,7 @@ def to_dict(self): "command": """dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true"""} }, {"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}}, - {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <>' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}}, + {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <> --header "Circle-Token: $CIRCLE_TOKEN"' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}}, {"run": {"name": "Split tests across parallel nodes: show current parallel tests", "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt" } @@ -371,9 +371,14 @@ def create_circleci_config(folder=None): **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}, **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs}, }, - "jobs" : {j.job_name: j.to_dict() for j in jobs}, - "workflows": {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} + "jobs" : {j.job_name: j.to_dict() for j in jobs} } + if "CIRCLE_TOKEN" in os.environ: + # For private forked repo. (e.g. new model addition) + config["workflows"] = {"version": 2, "run_tests": {"jobs": [{j.job_name: {"context": ["TRANSFORMERS_CONTEXT"]}} for j in jobs]}} + else: + # For public repo. (e.g. `transformers`) + config["workflows"] = {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} with open(os.path.join(folder, "generated_config.yml"), "w") as f: f.write(yaml.dump(config, sort_keys=False, default_flow_style=False).replace("' << pipeline", " << pipeline").replace(">> '", " >>")) diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 53e66662f9ee99..089be4a4460101 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -9,7 +9,7 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.4.1' +ARG PYTORCH='2.5.1' # Example: `cu102`, `cu113`, etc. ARG CUDA='cu118' @@ -36,12 +36,17 @@ RUN python3 -m pip install --no-cache-dir einops # Add bitsandbytes for mixed int8 testing RUN python3 -m pip install --no-cache-dir bitsandbytes -# Add auto-gptq for gtpq quantization testing -RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ +# Add auto-gptq for gtpq quantization testing, installed from source for pytorch==2.5.1 compatibility +# TORCH_CUDA_ARCH_LIST="7.5+PTX" is added to make the package compile for Tesla T4 gpus available for the CI. 
+RUN pip install gekko
+RUN git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ && TORCH_CUDA_ARCH_LIST="7.5+PTX" python3 setup.py install

 # Add optimum for gptq quantization testing
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum

+# Add PEFT
+RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft
+
 # Add aqlm for quantization testing
 RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2

@@ -52,8 +57,8 @@ RUN python3 -m pip install --no-cache-dir hqq
 RUN python3 -m pip install --no-cache-dir gguf

 # Add autoawq for quantization testing
-# >=v0.2.3 needed for compatibility with torch 2.2.1
-RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp310-cp310-linux_x86_64.whl
+# >=v0.2.7 needed for compatibility with transformers > 4.46
+RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.7.post2/autoawq-0.2.7.post2-py3-none-any.whl

 # Add quanto for quantization testing
 RUN python3 -m pip install --no-cache-dir optimum-quanto
diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml
index d9523eaf5da535..1208153c22df68 100644
--- a/docs/source/ar/_toctree.yml
+++ b/docs/source/ar/_toctree.yml
@@ -129,10 +129,10 @@
     title: التصدير إلى TFLite
   - local: torchscript
     title: التصدير إلى TorchScript
-#  - local: benchmarks
-#    title: المعايير
-#  - local: notebooks
-#    title: دفاتر الملاحظات مع الأمثلة
+  - local: benchmarks
+    title: المعايير
+  - local: notebooks
+    title: دفاتر الملاحظات مع الأمثلة
 #  - local: community
 #    title: موارد المجتمع
   - local: troubleshooting
diff --git a/docs/source/ar/benchmarks.md b/docs/source/ar/benchmarks.md
new file mode 100644
index 00000000000000..71e1829e643350
--- /dev/null
+++ b/docs/source/ar/benchmarks.md
@@ -0,0 +1,352 @@
+# معايير الأداء
+
+
+أدوات قياس الأداء من Hugging Face أصبحت قديمة، ويُنصح باستخدام مكتبات خارجية لقياس سرعة وتعقيد الذاكرة لنماذج Transformer.
+
+
+[[open-in-colab]]
+
+لنلق نظرة على كيفية تقييم أداء نماذج 🤗 Transformers، وأفضل الممارسات، ومعايير الأداء المتاحة بالفعل.
+
+يُمكن العثور على دفتر ملاحظات يشرح بالتفصيل كيفية قياس أداء نماذج 🤗 Transformers [هنا](https://github.com/huggingface/notebooks/tree/main/examples/benchmark.ipynb).
+
+## كيفية قياس أداء نماذج 🤗 Transformers
+
+تسمح الفئتان [`PyTorchBenchmark`] و [`TensorFlowBenchmark`] بتقييم أداء نماذج 🤗 Transformers بمرونة، إذ تتيحان لنا قياس _الاستخدام الأقصى للذاكرة_ و _الوقت اللازم_ لكل من _الاستدلال_ و _التدريب_.
+
+
+هنا، يُعرَّف _الاستدلال_ بأنه تمريرة أمامية واحدة، ويُعرَّف _التدريب_ بأنه تمريرة أمامية واحدة وتمريرة خلفية واحدة.
+
+
+تتوقع فئتا تقييم الأداء [`PyTorchBenchmark`] و [`TensorFlowBenchmark`] كائنًا من النوع [`PyTorchBenchmarkArguments`] و [`TensorFlowBenchmarkArguments`]، على التوالي، للتنفيذ. [`PyTorchBenchmarkArguments`] و [`TensorFlowBenchmarkArguments`] هما فئتا بيانات وتحتويان على جميع التكوينات ذات الصلة لفئة تقييم الأداء المقابلة. في المثال التالي، يتم توضيح كيفية تقييم أداء نموذج BERT من النوع _google-bert/bert-base-uncased_.
+
+
+
+```py
+>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
+
+>>> args = PyTorchBenchmarkArguments(models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> benchmark = PyTorchBenchmark(args)
+```
+
+
+```py
+>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments
+
+>>> args = TensorFlowBenchmarkArguments(
+...     models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
+>>> benchmark = TensorFlowBenchmark(args)
+```
+
+
+هنا، يتم تمرير ثلاثة معاملات إلى فئات بيانات معاملات قياس الأداء، وهي `models` و `batch_sizes` و `sequence_lengths`. المعامل `models` مطلوب ويتوقع `قائمة` من معرّفات النماذج من [مركز النماذج](https://huggingface.co/models). يحدد المعاملان `batch_sizes` و `sequence_lengths` حجم `input_ids` الذي يتم قياس أداء النموذج عليه. هناك العديد من المعلمات الأخرى التي يمكن تكوينها عبر فئات بيانات معاملات قياس الأداء. لمزيد من التفاصيل حول هذه المعلمات، يمكنك إما الرجوع مباشرة إلى الملفات `src/transformers/benchmark/benchmark_args_utils.py`، `src/transformers/benchmark/benchmark_args.py` (لـ PyTorch) و `src/transformers/benchmark/benchmark_args_tf.py` (لـ Tensorflow)، أو بدلاً من ذلك، قم بتشغيل أوامر shell التالية من المجلد الرئيسي لطباعة قائمة وصفية بجميع المعلمات القابلة للتكوين لـ PyTorch و Tensorflow على التوالي.
+
+
+
+```bash
+python examples/pytorch/benchmarking/run_benchmark.py --help
+```
+
+يُمكن ببساطة تشغيل كائن التقييم الذي تم تهيئته عن طريق استدعاء `benchmark.run()`.
+
+```py
+>>> results = benchmark.run()
+>>> print(results)
+==================== INFERENCE - SPEED - RESULT ====================
+--------------------------------------------------------------------------------
+Model Name                          Batch Size     Seq Length     Time in s
+--------------------------------------------------------------------------------
+google-bert/bert-base-uncased           8              8              0.006
+google-bert/bert-base-uncased           8              32             0.006
+google-bert/bert-base-uncased           8              128            0.018
+google-bert/bert-base-uncased           8              512            0.088
+--------------------------------------------------------------------------------
+
+==================== INFERENCE - MEMORY - RESULT ====================
+--------------------------------------------------------------------------------
+Model Name                          Batch Size     Seq Length    Memory in MB
+--------------------------------------------------------------------------------
+google-bert/bert-base-uncased           8              8              1227
+google-bert/bert-base-uncased           8              32             1281
+google-bert/bert-base-uncased           8              128            1307
+google-bert/bert-base-uncased           8              512            1539
+--------------------------------------------------------------------------------
+
+==================== ENVIRONMENT INFORMATION ====================
+
+- transformers_version: 2.11.0
+- framework: PyTorch
+- use_torchscript: False
+- framework_version: 1.4.0
+- python_version: 3.6.10
+- system: Linux
+- cpu: x86_64
+- architecture: 64bit
+- date: 2020-06-29
+- time: 08:58:43.371351
+- fp16: False
+- use_multiprocessing: True
+- only_pretrain_model: False
+- cpu_ram_mb: 32088
+- use_gpu: True
+- num_gpus: 1
+- gpu: TITAN RTX
+- gpu_ram_mb: 24217
+- gpu_power_watts: 280.0
+- gpu_performance_state: 2
+- use_tpu: False
+```
+
+
+```bash
+python examples/tensorflow/benchmarking/run_benchmark_tf.py --help
+```
+
+يُمكن بعد ذلك تشغيل كائن قياس الأداء الذي تم تهيئته عن طريق استدعاء `benchmark.run()`.
+
+```py
+>>> results = benchmark.run()
+>>> print(results)
+==================== INFERENCE - SPEED - RESULT ====================
+--------------------------------------------------------------------------------
+Model Name                          Batch Size     Seq Length     Time in s
+--------------------------------------------------------------------------------
+google-bert/bert-base-uncased           8              8              0.005
+google-bert/bert-base-uncased           8              32             0.008
+google-bert/bert-base-uncased           8              128            0.022
+google-bert/bert-base-uncased           8              512            0.105
+--------------------------------------------------------------------------------
+
+==================== INFERENCE - MEMORY - RESULT ====================
+--------------------------------------------------------------------------------
+Model Name                          Batch Size     Seq Length    Memory in MB
+--------------------------------------------------------------------------------
+google-bert/bert-base-uncased           8              8              1330
+google-bert/bert-base-uncased           8              32             1330
+google-bert/bert-base-uncased           8              128            1330
+google-bert/bert-base-uncased           8              512            1770
+--------------------------------------------------------------------------------
+
+==================== ENVIRONMENT INFORMATION ====================
+
+- transformers_version: 2.11.0
+- framework: Tensorflow
+- use_xla: False
+- framework_version: 2.2.0
+- python_version: 3.6.10
+- system: Linux
+- cpu: x86_64
+- architecture: 64bit
+- date: 2020-06-29
+- time: 09:26:35.617317
+- fp16: False
+- use_multiprocessing: True
+- only_pretrain_model: False
+- cpu_ram_mb: 32088
+- use_gpu: True
+- num_gpus: 1
+- gpu: TITAN RTX
+- gpu_ram_mb: 24217
+- gpu_power_watts: 280.0
+- gpu_performance_state: 2
+- use_tpu: False
+```
+
+
+بشكل افتراضي، يتم تقييم _الوقت_ و _الذاكرة المطلوبة_ لـ _الاستدلال_. في مثال المخرجات أعلاه، يُظهر القسمان الأولان النتيجة المقابلة لـ _وقت الاستدلال_ و _ذاكرة الاستدلال_. بالإضافة إلى ذلك، يتم طباعة جميع المعلومات ذات الصلة حول بيئة الحوسبة، على سبيل المثال نوع وحدة معالجة الرسومات (GPU)، والنظام، وإصدارات المكتبة، وما إلى ذلك، في القسم الثالث تحت _معلومات البيئة_. يمكن حفظ هذه المعلومات بشكل اختياري في ملف _.csv_ عند إضافة المعامل `save_to_csv=True` إلى [`PyTorchBenchmarkArguments`] و [`TensorFlowBenchmarkArguments`] على التوالي. في هذه الحالة، يتم حفظ كل قسم في ملف _.csv_ منفصل. يمكن اختيارًا تحديد مسار كل ملف _.csv_ عبر فئات بيانات معاملات قياس الأداء.
+
+بدلاً من تقييم النماذج المدربة مسبقًا عبر معرّف النموذج، على سبيل المثال `google-bert/bert-base-uncased`، يُمكن للمستخدم بدلاً من ذلك قياس أداء تكوين عشوائي لأي فئة نموذج متاحة. في هذه الحالة، يجب إدراج "قائمة" من التكوينات مع معاملات قياس الأداء كما هو موضح أدناه.
+
+
+
+```py
+>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig
+
+>>> args = PyTorchBenchmarkArguments(
+...     models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
+>>> config_base = BertConfig()
+>>> config_384_hid = BertConfig(hidden_size=384)
+>>> config_6_lay = BertConfig(num_hidden_layers=6)
+
+>>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay])
+>>> benchmark.run()
+==================== INFERENCE - SPEED - RESULT ====================
+--------------------------------------------------------------------------------
+Model Name                          Batch Size     Seq Length     Time in s
+--------------------------------------------------------------------------------
+bert-base                               8              8              0.006
+bert-base                               8              32             0.006
+bert-base                               8              128            0.018
+bert-base                               8              512            0.088
+bert-384-hid                            8              8              0.006
+bert-384-hid                            8              32             0.006
+bert-384-hid                            8              128            0.011
+bert-384-hid                            8              512            0.054
+bert-6-lay                              8              8              0.003
+bert-6-lay                              8              32             0.004
+bert-6-lay                              8              128            0.009
+bert-6-lay                              8              512            0.044
+--------------------------------------------------------------------------------
+
+==================== INFERENCE - MEMORY - RESULT ====================
+--------------------------------------------------------------------------------
+Model Name                          Batch Size     Seq Length    Memory in MB
+--------------------------------------------------------------------------------
+bert-base                               8              8              1277
+bert-base                               8              32             1281
+bert-base                               8              128            1307
+bert-base                               8              512            1539
+bert-384-hid                            8              8              1005
+bert-384-hid                            8              32             1027
+bert-384-hid                            8              128            1035
+bert-384-hid                            8              512            1255
+bert-6-lay                              8              8              1097
+bert-6-lay                              8              32             1101
+bert-6-lay                              8              128            1127
+bert-6-lay                              8              512            1359
+--------------------------------------------------------------------------------
+
+==================== معلومات البيئة ====================
+
+- transformers_version: 2.11.0
+- framework: PyTorch
+- use_torchscript: False
+- framework_version: 1.4.0
+- python_version: 3.6.10
+- system: Linux
+- cpu: x86_64
+- architecture: 64bit
+- date: 2020-06-29
+- time: 09:35:25.143267
+- fp16: False
+- use_multiprocessing: True
+- only_pretrain_model: False
+- cpu_ram_mb: 32088
+- use_gpu: True
+- num_gpus: 1
+- gpu: TITAN RTX
+- gpu_ram_mb: 24217
+- gpu_power_watts: 280.0
+- gpu_performance_state: 2
+- use_tpu: False
+```
+
+
+```py
+>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig
+
+>>> args = TensorFlowBenchmarkArguments(
+...     models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
+>>> config_base = BertConfig()
+>>> config_384_hid = BertConfig(hidden_size=384)
+>>> config_6_lay = BertConfig(num_hidden_layers=6)
+
+>>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay])
+>>> benchmark.run()
+==================== نتائج السرعة في الاستدلال ====================
+--------------------------------------------------------------------------------
+| اسم النموذج | حجم الدفعة | طول التسلسل | الوقت بالثانية |
+--------------------------------------------------------------------------------
+| bert-base | 8 | 8 | 0.005 |
+| bert-base | 8 | 32 | 0.008 |
+| bert-base | 8 | 128 | 0.022 |
+| bert-base | 8 | 512 | 0.106 |
+| bert-384-hid | 8 | 8 | 0.005 |
+| bert-384-hid | 8 | 32 | 0.007 |
+| bert-384-hid | 8 | 128 | 0.018 |
+| bert-384-hid | 8 | 512 | 0.064 |
+| bert-6-lay | 8 | 8 | 0.002 |
+| bert-6-lay | 8 | 32 | 0.003 |
+| bert-6-lay | 8 | 128 | 0.0011 |
+| bert-6-lay | 8 | 512 | 0.074 |
+--------------------------------------------------------------------------------
+
+==================== نتائج الذاكرة في الاستدلال ====================
+--------------------------------------------------------------------------------
+| اسم النموذج | حجم الدفعة | طول التسلسل | الذاكرة بالميغابايت |
+--------------------------------------------------------------------------------
+| bert-base | 8 | 8 | 1330 |
+| bert-base | 8 | 32 | 1330 |
+| bert-base | 8 | 128 | 1330 |
+| bert-base | 8 | 512 | 1770 |
+| bert-384-hid | 8 | 8 | 1330 |
+| bert-384-hid | 8 | 32 | 1330 |
+| bert-384-hid | 8 | 128 | 1330 |
+| bert-384-hid | 8 | 512 | 1540 |
+| bert-6-lay | 8 | 8 | 1330 |
+| bert-6-lay | 8 | 32 | 1330 |
+| bert-6-lay | 8 | 128 | 1330 |
+| bert-6-lay | 8 | 512 | 1540 |
+--------------------------------------------------------------------------------
+
+==================== معلومات البيئة ====================
+
+- transformers_version: 2.11.0
+- framework: Tensorflow
+- use_xla: False
+- framework_version: 2.2.0
+- python_version: 3.6.10
+- system: Linux
+- cpu: x86_64
+- architecture: 64bit
+- date: 2020-06-29
+- time: 09:38:15.487125
+- fp16: False
+- use_multiprocessing: True
+- only_pretrain_model: False
+- cpu_ram_mb: 32088
+- use_gpu: True
+- num_gpus: 1
+- gpu: TITAN RTX
+- gpu_ram_mb: 24217
+- gpu_power_watts: 280.0
+- gpu_performance_state: 2
+- use_tpu: False
+```
+
+
+مرة أخرى، يتم قياس _وقت الاستدلال_ و _الذاكرة المطلوبة_ للاستدلال، ولكن هذه المرة لتكوينات مخصصة لـ `BertModel`. يمكن أن تكون هذه الميزة مفيدة بشكل خاص عند اتخاذ قرار بشأن التكوين الذي يجب تدريب النموذج عليه.
+
+## أفضل الممارسات في اختبار الأداء
+
+يسرد هذا القسم بعض أفضل الممارسات التي يجب مراعاتها عند إجراء اختبار الأداء لنموذج ما.
+
+- حالياً، يتم دعم اختبار الأداء على جهاز واحد فقط. عند إجراء الاختبار على وحدة معالجة الرسوميات (GPU)، يوصى بأن يقوم المستخدم بتحديد الجهاز الذي يجب تشغيل التعليمات البرمجية عليه من خلال تعيين متغير البيئة `CUDA_VISIBLE_DEVICES` في الشل، على سبيل المثال `export CUDA_VISIBLE_DEVICES=0` قبل تشغيل التعليمات البرمجية.
+- يجب تعيين الخيار `no_multi_processing` إلى `True` فقط لأغراض الاختبار والتصحيح. ولضمان قياس الذاكرة بدقة، يوصى بتشغيل كل اختبار ذاكرة في عملية منفصلة والتأكد من تعيين `no_multi_processing` إلى `True`.
+- يجب دائمًا ذكر معلومات البيئة عند مشاركة نتائج تقييم النموذج. 
يُمكن أن تختلف النتائج اختلافًا كبيرًا بين أجهزة GPU المختلفة وإصدارات المكتبات، وما إلى ذلك، لذلك فإن نتائج الاختبار بمفردها ليست مفيدة جدًا للمجتمع. + +## مشاركة نتائج اختبار الأداء الخاص بك + +في السابق، تم إجراء اختبار الأداء لجميع النماذج الأساسية المتاحة (10 في ذلك الوقت) لقياس _وقت الاستدلال_، عبر العديد من الإعدادات المختلفة: باستخدام PyTorch، مع TorchScript وبدونها، باستخدام TensorFlow، مع XLA وبدونه. تم إجراء جميع هذه الاختبارات على وحدات المعالجة المركزية (CPU) (باستثناء XLA TensorFlow) ووحدات معالجة الرسوميات (GPU). + +يتم شرح هذا النهج بالتفصيل في [منشور المدونة هذا](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) وتتوفر النتائج [هنا](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing). + +مع أدوات اختبار الأداء الجديدة، أصبح من الأسهل من أي وقت مضى مشاركة نتائج اختبار الأداء الخاص بك مع المجتمع: + +- [نتائج اختبار الأداء في PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/benchmarking/README.md). +- [نتائج اختبار الأداء في TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/benchmarking/README.md). diff --git a/docs/source/ar/installation.md b/docs/source/ar/installation.md index ac5962ec8589e8..d3bd4c655b6038 100644 --- a/docs/source/ar/installation.md +++ b/docs/source/ar/installation.md @@ -144,7 +144,7 @@ conda install conda-forge::transformers تُحمّل النماذج المُسبقة التدريب وتُخزّن مؤقتًا في: `~/.cache/huggingface/hub`. هذا هو المجلد الافتراضي الذي يُحدده متغير البيئة `TRANSFORMERS_CACHE`. على Windows، يكون دليل ذاكرة التخزين المؤقت الافتراضي هو `C:\Users\username\.cache\huggingface\hub`. يمكنك تغيير متغيرات البيئة shell الموضحة أدناه - حسب الأولوية - لتحديد دليل ذاكرة تخزين مؤقت مختلف: -1. متغير البيئة (افتراضي): `HUGGINGFACE_HUB_CACHE` أو `TRANSFORMERS_CACHE`. +1. متغير البيئة (افتراضي): `HF_HUB_CACHE` أو `TRANSFORMERS_CACHE`. 2. متغير البيئة: `HF_HOME`. 3. متغير البيئة: `XDG_CACHE_HOME` + `/huggingface`. diff --git a/docs/source/ar/notebooks.md b/docs/source/ar/notebooks.md new file mode 100644 index 00000000000000..0591204d602c7e --- /dev/null +++ b/docs/source/ar/notebooks.md @@ -0,0 +1,141 @@ +# دفاتر ملاحظات 🤗 Transformers + +يمكنك أن تجد هنا قائمة بدفاتر الملاحظات الرسمية التي تقدمها Hugging Face. + +كما نود أن ندرج هنا محتوى مثيرًا للاهتمام تم إنشاؤه بواسطة المجتمع. +إذا كتبت دفتر ملاحظات يستفيد من 🤗 Transformers وتود إدراجه هنا، فيُرجى فتح طلب سحب حتى يمكن تضمينه ضمن دفاتر ملاحظات المجتمع. 
+ + +## دفاتر ملاحظات Hugging Face 🤗 + +### دفاتر ملاحظات التوثيق + +يمكنك فتح أي صفحة من صفحات التوثيق كدفتر ملاحظات في Colab (يوجد زر مباشرة على تلك الصفحات) ولكنها مدرجة هنا أيضًا إذا كنت بحاجة إليها: + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [جولة سريعة في المكتبة](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb) | عرض لمختلف واجهات برمجة التطبيقات في Transformers |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/en/transformers_doc/quicktour.ipynb)| +| [ملخص المهام](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb) | كيفية تشغيل نماذج مكتبة Transformers مهمة تلو الأخرى |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| +| [معالجة البيانات مسبقًا](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb) | كيفية استخدام محلل لغوي لمعالجة بياناتك مسبقًا |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)| +| [الضبط الدقيق لنموذج مُدرَّب مسبقًا](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb) | كيفية استخدام المدرب لضبط نموذج مُدرَّب مسبقًا بدقة |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| +| [ملخص للمحللات اللغوية](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb) | الاختلافات بين خوارزمية المحلل اللغوي |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| +| [النماذج متعددة اللغات](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb) | كيفية استخدام النماذج متعددة اللغات للمكتبة |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| [![Open in AWS 
Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| + + +### أمثلة PyTorch + +#### معالجة اللغة الطبيعية[[pytorch-nlp]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [تدريب محللك اللغوي](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb) | كيفية تدريب واستخدام محللك اللغوي الخاص بك |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| +| [تدريب نموذج لغتك](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb) | كيفية البدء بسهولة في استخدام المحولات |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على أي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| +| [كيفية ضبط نموذج بدقة على النمذجة اللغوية](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة LM سببية أو مقنعة. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الرموز المميزة](https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة تصنيف الرموز المميزة (NER، PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| +| [كيفية ضبط نموذج بدقة على الإجابة على الأسئلة](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SQUAD. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| +| [كيفية ضبط نموذج بدقة على الاختيار من متعدد](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| +| [كيفية ضبط نموذج بدقة على الترجمة](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| +| [كيفية ضبط نموذج بدقة على التلخيص](https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على XSUM. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| +| [كيفية تدريب نموذج لغة من البداية](https://github.com/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| تسليط الضوء على جميع الخطوات لتدريب نموذج Transformer بشكل فعال على بيانات مخصصة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| +| [كيفية إنشاء نص](https://github.com/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| كيفية استخدام أساليب فك التشفير المختلفة لإنشاء اللغة باستخدام المحولات | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| +| [كيفية إنشاء نص (مع قيود)](https://github.com/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| كيفية توجيه إنشاء اللغة باستخدام القيود التي يوفرها المستخدم | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| [![Open in AWS 
Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| +| [Reformer](https://github.com/huggingface/blog/blob/main/notebooks/03_reformer.ipynb)| كيف يدفع Reformer حدود النمذجة اللغوية | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)| + +#### رؤية الكمبيوتر[[pytorch-cv]] + +| دفتر الملاحظات | الوصف | | | +|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------:| +| [كيفية ضبط نموذج بدقة على تصنيف الصور (Torchvision)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) | يوضح كيفية معالجة البيانات مسبقًا باستخدام Torchvision وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الصور (Albumentations)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) | يوضح كيفية معالجة البيانات مسبقًا باستخدام Albumentations وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الصور (Kornia)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb) | يوضح كيفية معالجة البيانات مسبقًا باستخدام Kornia وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)| +| [كيفية إجراء الكشف عن الأشياء بدون لقطات مع OWL-ViT](https://github.com/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb) | يوضح كيفية إجراء الكشف عن الأشياء بدون لقطات على الصور باستخدام استعلامات نصية | [![Open in 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| +| [كيفية ضبط نموذج وصف الصور بدقة](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) | يوضح كيفية ضبط BLIP بدقة لوصف الصور على مجموعة بيانات مخصصة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)| +| [كيفية بناء نظام تشابه الصور مع Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_similarity.ipynb) | يوضح كيفية بناء نظام تشابه الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)| +| [كيفية ضبط نموذج SegFormer بدقة على التجزئة الدلالية](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج SegFormer مُدرَّب مسبقًا بدقة على التجزئة الدلالية | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)| +| [كيفية ضبط نموذج VideoMAE بدقة على تصنيف الفيديو](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج VideoMAE مُدرَّب مسبقًا بدقة على تصنيف الفيديو | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)| + + +#### الصوت[[pytorch-audio]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية ضبط نموذج التعرف على الكلام باللغة الإنجليزية بدقة](https://github.com/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج كلام مُدرَّب مسبقًا بدقة على TIMIT | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb)| +| [كيفية ضبط نموذج التعرف على الكلام بأي لغة 
بدقة](https://github.com/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج كلام مُدرَّب مسبقًا متعدد اللغات بدقة على Common Voice | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الصوت](https://github.com/huggingface/notebooks/blob/main/examples/audio_classification.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج كلام مُدرَّب مسبقًا بدقة على Keyword Spotting | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb)| + + +#### التسلسلات البيولوجية[[pytorch-bio]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:----------------------------------------------------------------------------------------|:-------------|------:| +| [كيفية ضبط نموذج بروتين مُدرَّب مسبقًا بدقة](https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb) | شاهد كيفية ترميز البروتينات وضبط نموذج "لغة" بروتين مُدرَّب مسبقًا كبير بدقة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb) | +| [كيفية إنشاء طيات بروتينية](https://github.com/huggingface/notebooks/blob/main/examples/protein_folding.ipynb) | شاهد كيفية الانتقال من تسلسل البروتين إلى نموذج بروتين كامل وملف PDB | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_folding.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/protein_folding.ipynb) | +| [كيفية ضبط نموذج محول النيوكليوتيدات بدقة](https://github.com/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | شاهد كيفية ترميز الحمض النووي وضبط نموذج "لغة" الحمض النووي مُدرَّب مسبقًا كبير بدقة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | +| [ضبط نموذج محول النيوكليوتيدات بدقة باستخدام LoRA](https://github.com/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | تدريب نماذج DNA أكبر بكثير بطريقة فعالة من حيث الذاكرة | [![Open in 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | + + +#### طرائق أخرى[[pytorch-other]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:----------------------------------------------------------------------------------------|:-------------|------:| +| [التنبؤ الاحتمالي بالسلاسل الزمنية](https://github.com/huggingface/notebooks/blob/main/examples/time-series-transformers.ipynb) | شاهد كيفية تدريب Time Series Transformer على مجموعة بيانات مخصصة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/time-series-transformers.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/time-series-transformers.ipynb) | + +#### دفاتر ملاحظات الأدوات المساعدة [[pytorch-utility]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية تصدير النموذج إلى ONNX](https://github.com/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| تسليط الضوء على كيفية التصدير وتشغيل أعباء عمل الاستدلال من خلال ONNX | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| +| [كيفية استخدام المعايير](https://github.com/huggingface/notebooks/blob/main/examples/benchmark.ipynb)| كيفية قياس أداء النماذج باستخدام المحولات | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/benchmark.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/benchmark.ipynb)| + +### أمثلة TensorFlow + +#### معالجة اللغة الطبيعية[[tensorflow-nlp]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [تدريب محللك اللغوي](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb) | كيفية تدريب واستخدام محللك اللغوي الخاص بك |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| +| [تدريب نموذج لغتك](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch-tf.ipynb) | كيفية البدء بسهولة في استخدام المحولات |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch-tf.ipynb)| [![Open in AWS 
Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على أي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على النمذجة اللغوية](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة LM سببية أو مقنعة. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الرموز المميزة](https://github.com/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة تصنيف الرموز المميزة (NER، PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على الإجابة على الأسئلة](https://github.com/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SQUAD. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على الاختيار من متعدد](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على الترجمة](https://github.com/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على WMT. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على التلخيص](https://github.com/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على XSUM. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)| + +#### رؤية الكمبيوتر[[tensorflow-cv]] + +| دفتر الملاحظات | الوصف | | | +|:---------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------|:-------------|------:| +| [كيفية ضبط نموذج بدقة على تصنيف الصور](https://github.com/huggingface/notebooks/blob/main/examples/image_classification-tf.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification-tf.ipynb)| +| [كيفية ضبط نموذج SegFormer بدقة على التجزئة الدلالية](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج SegFormer مُدرَّب مسبقًا بدقة على التجزئة الدلالية | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb)| + +#### التسلسلات البيولوجية[[tensorflow-bio]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية ضبط نموذج بروتين مُدرَّب مسبقًا بدقة](https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb) | شاهد كيفية ترميز البروتينات وضبط نموذج "لغة" بروتين مُدرَّب مسبقًا كبير بدقة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb) | + +#### دفاتر ملاحظات الأدوات المساعدة [[tensorflow-utility]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية تدريب نماذج TF/Keras على TPU](https://github.com/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) | 
شاهد كيفية التدريب بسرعة عالية على أجهزة TPU من Google | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) | + +### دفاتر ملاحظات Optimum + +🤗 [Optimum](https://github.com/huggingface/optimum) هو امتداد لـ 🤗 Transformers، يوفر مجموعة من أدوات تحسين الأداء التي تمكن من تحقيق أقصى قدر من الكفاءة لتدريب وتشغيل النماذج على الأجهزة المستهدفة. + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية تكميم نموذج باستخدام ONNX Runtime لتصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| يوضح كيفية تطبيق التكميم الثابت والديناميكي على نموذج باستخدام [ONNX Runtime](https://github.com/microsoft/onnxruntime) لأي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| +| [كيفية تكميم نموذج باستخدام Intel Neural Compressor لتصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| يوضح كيفية تطبيق التكميم الثابت والديناميكي والتدريبي على نموذج باستخدام [Intel Neural Compressor (INC)](https://github.com/intel/neural-compressor) لأي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف النص باستخدام ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج بدقة على أي مهمة GLUE باستخدام [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| +| [كيفية ضبط نموذج بدقة على التلخيص باستخدام ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج بدقة على XSUM باستخدام [ONNX Runtime](https://github.com/microsoft/onnxruntime). 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| + + +## دفاتر ملاحظات المجتمع: + +تتوفر المزيد من دفاتر الملاحظات التي طورها المجتمع [هنا](https://hf.co/docs/transformers/community#community-notebooks). + diff --git a/docs/source/de/installation.md b/docs/source/de/installation.md index 1bd34f73302b27..44b6f1ed981e1e 100644 --- a/docs/source/de/installation.md +++ b/docs/source/de/installation.md @@ -149,7 +149,7 @@ conda install conda-forge::transformers Vorgefertigte Modelle werden heruntergeladen und lokal zwischengespeichert unter: `~/.cache/huggingface/hub`. Dies ist das Standardverzeichnis, das durch die Shell-Umgebungsvariable "TRANSFORMERS_CACHE" vorgegeben ist. Unter Windows wird das Standardverzeichnis durch `C:\Benutzer\Benutzername\.cache\huggingface\hub` angegeben. Sie können die unten aufgeführten Shell-Umgebungsvariablen - in der Reihenfolge ihrer Priorität - ändern, um ein anderes Cache-Verzeichnis anzugeben: -1. Shell-Umgebungsvariable (Standard): `HUGGINGFACE_HUB_CACHE` oder `TRANSFORMERS_CACHE`. +1. Shell-Umgebungsvariable (Standard): `HF_HUB_CACHE` oder `TRANSFORMERS_CACHE`. 2. Shell-Umgebungsvariable: `HF_HOME`. 3. Shell-Umgebungsvariable: `XDG_CACHE_HOME` + `/huggingface`. diff --git a/docs/source/en/_config.py b/docs/source/en/_config.py index 4381def017ddc5..f49e4e4731965a 100644 --- a/docs/source/en/_config.py +++ b/docs/source/en/_config.py @@ -11,4 +11,4 @@ "{processor_class}": "FakeProcessorClass", "{model_class}": "FakeModelClass", "{object_class}": "FakeObjectClass", -} \ No newline at end of file +} diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d800e40ecbd69d..6e325e499f342d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -218,6 +218,8 @@ title: CPU inference - local: perf_infer_gpu_one title: GPU inference + - local: perf_infer_gpu_multi + title: Multi-GPU inference title: Optimizing inference - local: big_models title: Instantiate a big model @@ -514,8 +516,8 @@ title: Nyströmformer - local: model_doc/olmo title: OLMo - - local: model_doc/olmo_1124 - title: OLMo November 2024 + - local: model_doc/olmo2 + title: OLMo2 - local: model_doc/olmoe title: OLMoE - local: model_doc/open-llama @@ -655,6 +657,8 @@ title: GLPN - local: model_doc/hiera title: Hiera + - local: model_doc/ijepa + title: I-JEPA - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit @@ -806,6 +810,8 @@ title: ALIGN - local: model_doc/altclip title: AltCLIP + - local: model_doc/aria + title: Aria - local: model_doc/blip title: BLIP - local: model_doc/blip-2 diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md index 721e348f89fe22..56c9184980f4b2 100644 --- a/docs/source/en/agents.md +++ b/docs/source/en/agents.md @@ -225,7 +225,7 @@ You have access to the following tools: To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use. -Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence. 
+Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence. During each intermediate step, you can use 'print()' to save whatever important information you will then need. These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step. diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md index e80e402d7374cd..c4753bf1366b09 100644 --- a/docs/source/en/agents_advanced.md +++ b/docs/source/en/agents_advanced.md @@ -211,7 +211,7 @@ agent.run("How many more blocks (also denoted as layers) are in BERT base encode ## Display your agent run in a cool Gradio interface -You can leverage `gradio.Chatbot`to display your agent's thoughts using `stream_to_gradio`, here is an example: +You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`, here is an example: ```py import gradio as gr diff --git a/docs/source/en/autoclass_tutorial.md b/docs/source/en/autoclass_tutorial.md index 0f02f19ed29534..33f48b2b043fec 100644 --- a/docs/source/en/autoclass_tutorial.md +++ b/docs/source/en/autoclass_tutorial.md @@ -138,12 +138,15 @@ Load a processor with [`AutoProcessor.from_pretrained`]: -The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]: +The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]. + +> [!WARNING] +> By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. ```py >>> from transformers import AutoModelForSequenceClassification ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") ``` Easily reuse the same checkpoint to load an architecture for a different task: @@ -151,7 +154,7 @@ Easily reuse the same checkpoint to load an architecture for a different task: ```py >>> from transformers import AutoModelForTokenClassification ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") ``` diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 621edeb20e8ea3..47032a2a292b1b 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -416,16 +416,6 @@ Assisted decoding assumes the main and assistant models have the same tokenizer, Currently, only greedy search and sampling are supported with assisted decoding, and assisted decoding doesn't support batched inputs. To learn more about assisted decoding, check [this blog post](https://huggingface.co/blog/assisted-generation). 
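Standard assisted decoding, as noted above, assumes the main and assistant models share a tokenizer. A rough way to sanity-check a candidate pair before relying on that assumption is to compare their vocabularies; the checkpoints below are the same Pythia pair used in the surrounding examples, and this is only an illustrative sketch:

```python
# Rough check that two checkpoints share a tokenizer (an assumption of standard assisted decoding).
from transformers import AutoTokenizer

main_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-1.4b-deduped")
assistant_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-160m-deduped")

# Identical vocabularies are a strong signal the tokenizers match; if they differ,
# Universal Assisted Decoding (described below) is the option to reach for.
print(main_tokenizer.get_vocab() == assistant_tokenizer.get_vocab())
```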
-#### Universal Assisted Decoding - -Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers. -To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below). -Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are -in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above. -The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer. -Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings, -to ensure the new tokens include the correct prompt suffix. - To enable assisted decoding, set the `assistant_model` argument with a model. ```python @@ -445,7 +435,38 @@ To enable assisted decoding, set the `assistant_model` argument with a model. ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` -If the main and assistant models have different tokenizers, use Universal Assisted Decoding. +When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness, +just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency. + +```python +>>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed +>>> set_seed(42) # For reproducibility + +>>> prompt = "Alice and Bob" +>>> checkpoint = "EleutherAI/pythia-1.4b-deduped" +>>> assistant_checkpoint = "EleutherAI/pythia-160m-deduped" + +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +>>> inputs = tokenizer(prompt, return_tensors="pt") + +>>> model = AutoModelForCausalLM.from_pretrained(checkpoint) +>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) +>>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5) +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +['Alice and Bob, a couple of friends of mine, who are both in the same office as'] +``` + +We recommend to install `scikit-learn` library to enhance the candidate generation strategy and achieve additional speedup. + +#### Universal Assisted Decoding + +Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers. +To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below). +Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are +in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above. +The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer. +Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings, +to ensure the new tokens include the correct prompt suffix. ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer @@ -465,30 +486,35 @@ If the main and assistant models have different tokenizers, use Universal Assist ['Alice and Bob are sitting in a bar. 
Alice is drinking a beer and Bob is drinking a'] ``` -When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness, -just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency. +#### Prompt Lookup + +Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed +to model based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259). + +#### Self-Speculative Decoding + +An LLM can be trained to also use its language modeling head with earlier hidden states as input, effectively +skipping layers to yield a lower-quality output -- a technique called early exiting. +We use the lower-quality early exit output as an assistant output, and apply self-speculation to fix the output using the remaining layers. The final generation of that self-speculative solution is the same (or has the same distribution) as the original model's generation. +If the model you're using was trained to do early exit, you can pass +`assistant_early_exit` (integer). In this case, the assistant model will be the same model but exiting early, hence the +"self-speculative" name. Because the assistant model is a portion of the target model, caches and weights can be shared, which results in lower memory requirements. As in other assisted generation methods, the final generated result has the same quality as if no assistant had been used. ```python ->>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed ->>> set_seed(42) # For reproducibility +>>> from transformers import AutoModelForCausalLM, AutoTokenizer >>> prompt = "Alice and Bob" ->>> checkpoint = "EleutherAI/pythia-1.4b-deduped" ->>> assistant_checkpoint = "EleutherAI/pythia-160m-deduped" +>>> checkpoint = "facebook/layerskip-llama3.2-1B" >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) >>> inputs = tokenizer(prompt, return_tensors="pt") >>> model = AutoModelForCausalLM.from_pretrained(checkpoint) ->>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) ->>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5) +>>> outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Alice and Bob, a couple of friends of mine, who are both in the same office as'] +['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` -Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed -to model based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259). 
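The Prompt Lookup subsection above describes the `prompt_lookup_num_tokens` flag without a snippet of its own. A minimal sketch of enabling it, with an illustrative checkpoint and token count:

```python
# Minimal sketch of n-gram based (prompt lookup) assisted decoding; values are illustrative.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "EleutherAI/pythia-1.4b-deduped"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt")
# n-grams copied from the prompt act as the draft tokens, so no assistant model is loaded
outputs = model.generate(**inputs, prompt_lookup_num_tokens=3, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```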
- ### DoLa Decoding **D**ecoding by C**o**ntrasting **La**yers (DoLa) is a contrastive decoding strategy to improve the factuality and reduce the diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index 2da721b28986af..b1ed1f0d492ab9 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -87,6 +87,7 @@ For now the supported model architectures are the architectures that have been v - Starcoder2 - T5 - Mamba +- Nemotron ## Example usage diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 341cb417c7b8ac..181ec8b10fb1fc 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -62,6 +62,8 @@ Flax), PyTorch, and/or TensorFlow. | [ALBERT](model_doc/albert) | ✅ | ✅ | ✅ | | [ALIGN](model_doc/align) | ✅ | ❌ | ❌ | | [AltCLIP](model_doc/altclip) | ✅ | ❌ | ❌ | +| [Aria](model_doc/aria) | ✅ | ❌ | ❌ | +| [AriaText](model_doc/aria_text) | ✅ | ❌ | ❌ | | [Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer) | ✅ | ❌ | ❌ | | [Autoformer](model_doc/autoformer) | ✅ | ❌ | ❌ | | [Bark](model_doc/bark) | ✅ | ❌ | ❌ | @@ -168,9 +170,11 @@ Flax), PyTorch, and/or TensorFlow. | [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | +| [I-JEPA](model_doc/ijepa) | ✅ | ❌ | ❌ | | [IDEFICS](model_doc/idefics) | ✅ | ✅ | ❌ | | [Idefics2](model_doc/idefics2) | ✅ | ❌ | ❌ | | [Idefics3](model_doc/idefics3) | ✅ | ❌ | ❌ | +| [Idefics3VisionTransformer](model_doc/idefics3_vision) | ❌ | ❌ | ❌ | | [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ | | [Informer](model_doc/informer) | ✅ | ❌ | ❌ | | [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ | @@ -240,7 +244,7 @@ Flax), PyTorch, and/or TensorFlow. | [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ | | [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ | | [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ | -| [OLMo November 2024](model_doc/olmo_1124) | ✅ | ❌ | ❌ | +| [OLMo2](model_doc/olmo2) | ✅ | ❌ | ❌ | | [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ | | [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ | | [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index f4ce768c3168e9..af7c97ef3508ae 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -157,7 +157,7 @@ conda install conda-forge::transformers Pretrained models are downloaded and locally cached at: `~/.cache/huggingface/hub`. This is the default directory given by the shell environment variable `TRANSFORMERS_CACHE`. On Windows, the default directory is given by `C:\Users\username\.cache\huggingface\hub`. You can change the shell environment variables shown below - in order of priority - to specify a different cache directory: -1. Shell environment variable (default): `HUGGINGFACE_HUB_CACHE` or `TRANSFORMERS_CACHE`. +1. Shell environment variable (default): `HF_HUB_CACHE` or `TRANSFORMERS_CACHE`. 2. Shell environment variable: `HF_HOME`. 3. Shell environment variable: `XDG_CACHE_HOME` + `/huggingface`. 
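To make the cache-variable priority list above concrete, here is a minimal sketch of redirecting the cache from Python before anything is downloaded; the paths are placeholders:

```python
# Point the Hugging Face caches at a custom location; set the variables before
# importing transformers, since cache paths are typically resolved at import time.
import os

os.environ["HF_HOME"] = "/data/hf-home"            # parent directory for all HF caches (placeholder path)
os.environ["HF_HUB_CACHE"] = "/data/hf-hub-cache"  # hub downloads; takes priority over HF_HOME (placeholder path)

from transformers import AutoTokenizer

# Downloads now land under /data/hf-hub-cache instead of ~/.cache/huggingface/hub
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
```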
diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index eb25ddb6329755..a54ac432006a84 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -436,3 +436,9 @@ A [`Constraint`] can be used to force the generation to include specific tokens [[autodoc]] SynthIDTextWatermarkDetector - __call__ + +## Compile Utils + +[[autodoc]] CompileConfig + - __call__ + diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md index 05ab9eafa72349..b1d1e0998f06ed 100644 --- a/docs/source/en/kv_cache.md +++ b/docs/source/en/kv_cache.md @@ -180,7 +180,7 @@ Fun fact: The shortest war in history was between Britain and Zanzibar on August -Cache offloading requires a GPU and can be slower than dynamic KV cache. Use it if you are getting CUDA out of memory errors. +Cache offloading requires a CUDA GPU and can be slower than dynamic KV cache. Use it if you are getting CUDA out of memory errors. @@ -261,6 +261,7 @@ This will use the [`~OffloadedStaticCache`] implementation instead. >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0] "Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of" ``` +Cache offloading requires a CUDA GPU. ### Sliding Window Cache diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 0a6a7e15bea081..e97ace8a625050 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -57,13 +57,13 @@ import os os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") -model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto") model.generation_config.cache_implementation = "static" model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) input_text = "The theory of special relativity states " -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type) outputs = model.generate(**input_ids) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -89,11 +89,11 @@ import os os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") -model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto") model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) input_text = "The theory of special relativity states " -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type) prompt_length = input_ids.input_ids.shape[1] model.generation_config.max_new_tokens = 16 @@ -126,6 +126,7 @@ If you want to go further down a level, the [`StaticCache`] object can also be p from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging from transformers.testing_utils import CaptureLogger import torch +from accelerate.test_utils.testing import get_backend prompts = [ "Simply put, the theory of relativity states that ", @@ -133,7 +134,7 @@ prompts = [ ] NUM_TOKENS_TO_GENERATE = 40 -torch_device = "cuda" +torch_device, _, _ = get_backend() # automatically detects the underlying 
device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token="", padding_side="right") model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="sequential") @@ -201,11 +202,11 @@ import os os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") -model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto") model.generate = torch.compile(model.generate, mode="reduce-overhead", fullgraph=True) input_text = "The theory of special relativity states " -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type) outputs = model.generate(**input_ids) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -241,13 +242,14 @@ Enable speculative decoding by loading an assistant model and passing it to the ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("Einstein's theory of relativity states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device) outputs = model.generate(**inputs, assistant_model=assistant_model) tokenizer.batch_decode(outputs, skip_special_tokens=True) @@ -262,13 +264,14 @@ For speculative sampling decoding, add the `do_sample` and `temperature` paramet ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("Einstein's theory of relativity states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device) outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.7) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -290,13 +293,14 @@ To enable prompt lookup decoding, specify the number of tokens that should be ov ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) 
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device) outputs = model.generate(**inputs, prompt_lookup_num_tokens=3) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -311,13 +315,14 @@ For prompt lookup decoding with sampling, add the `do_sample` and `temperature` ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) outputs = model.generate(**inputs, prompt_lookup_num_tokens=3, do_sample=True, temperature=0.7) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) ["The second law of thermodynamics states that energy cannot be created nor destroyed. It's not a"] diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md index 9d3d8ad6ba8b86..3414725fc37087 100644 --- a/docs/source/en/llm_tutorial_optimization.md +++ b/docs/source/en/llm_tutorial_optimization.md @@ -147,7 +147,7 @@ Let's call it now for the next experiment. ```python flush() ``` -In the recent version of the accelerate library, you can also use a utility method called `release_memory()` +From the Accelerate library, you can also use a device-agnostic utility method called [release_memory](https://github.com/huggingface/accelerate/blob/29be4788629b772a3b722076e433b5b3b5c85da3/src/accelerate/utils/memory.py#L63), which takes various hardware backends like XPU, MLU, NPU, MPS, and more into account. ```python from accelerate.utils import release_memory diff --git a/docs/source/en/model_doc/aria.md b/docs/source/en/model_doc/aria.md new file mode 100644 index 00000000000000..9ff7a6687aa939 --- /dev/null +++ b/docs/source/en/model_doc/aria.md @@ -0,0 +1,106 @@ + + +# Aria + +## Overview + +The Aria model was proposed in [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://huggingface.co/papers/2410.05993) by Li et al. from the Rhymes.AI team. + +Aria is an open multimodal-native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. It has a Mixture-of-Experts architecture, with respectively 3.9B and 3.5B activated parameters per visual token and text token. + +The abstract from the paper is the following: + +*Information comes in diverse modalities. Multimodal native AI models are essential to integrate real-world information and deliver comprehensive understanding. While proprietary multimodal native models exist, their lack of openness imposes obstacles for adoptions, let alone adaptations. To fill this gap, we introduce Aria, an open multimodal native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. 
Aria is a mixture-of-expert model with 3.9B and 3.5B activated parameters per visual token and text token, respectively. It outperforms Pixtral-12B and Llama3.2-11B, and is competitive against the best proprietary models on various multimodal tasks. We pre-train Aria from scratch following a 4-stage pipeline, which progressively equips the model with strong capabilities in language understanding, multimodal understanding, long context window, and instruction following. We open-source the model weights along with a codebase that facilitates easy adoptions and adaptations of Aria in real-world applications.* + +This model was contributed by [m-ric](https://huggingface.co/m-ric). +The original code can be found [here](https://github.com/rhymes-ai/Aria). + +## Usage tips + +Here's how to use the model for vision tasks: +```python +import requests +import torch +from PIL import Image + +from transformers import AriaProcessor, AriaForConditionalGeneration + +model_id_or_path = "rhymes-ai/Aria" + +model = AriaForConditionalGeneration.from_pretrained( + model_id_or_path, device_map="auto" +) + +processor = AriaProcessor.from_pretrained(model_id_or_path) + +image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + +messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"text": "what is the image?", "type": "text"}, + ], + } +] + +text = processor.apply_chat_template(messages, add_generation_prompt=True) +inputs = processor(text=text, images=image, return_tensors="pt") +inputs.to(model.device) + +output = model.generate( + **inputs, + max_new_tokens=15, + stop_strings=["<|im_end|>"], + tokenizer=processor.tokenizer, + do_sample=True, + temperature=0.9, +) +output_ids = output[0][inputs["input_ids"].shape[1]:] +response = processor.decode(output_ids, skip_special_tokens=True) +``` + + +## AriaImageProcessor + +[[autodoc]] AriaImageProcessor + +## AriaProcessor + +[[autodoc]] AriaProcessor + +## AriaTextConfig + +[[autodoc]] AriaTextConfig + +## AriaConfig + +[[autodoc]] AriaConfig + +## AriaTextModel + +[[autodoc]] AriaTextModel + +## AriaTextForCausalLM + +[[autodoc]] AriaTextForCausalLM + +## AriaForConditionalGeneration + +[[autodoc]] AriaForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/blip-2.md b/docs/source/en/model_doc/blip-2.md index b57c69ca6b321b..4125d372d55ad5 100644 --- a/docs/source/en/model_doc/blip-2.md +++ b/docs/source/en/model_doc/blip-2.md @@ -40,6 +40,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/5 - BLIP-2 can be used for conditional text generation given an image and an optional text prompt. At inference time, it's recommended to use the [`generate`] method. - One can use [`Blip2Processor`] to prepare images for the model, and decode the predicted tokens ID's back to text. +> [!NOTE] +> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there wil be failure when merging the embeddings. 
+The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042). + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BLIP-2. diff --git a/docs/source/en/model_doc/deformable_detr.md b/docs/source/en/model_doc/deformable_detr.md index 82ef251d478b95..5ed99dfe81d1c0 100644 --- a/docs/source/en/model_doc/deformable_detr.md +++ b/docs/source/en/model_doc/deformable_detr.md @@ -54,6 +54,12 @@ If you're interested in submitting a resource to be included here, please feel f - preprocess - post_process_object_detection +## DeformableDetrImageProcessorFast + +[[autodoc]] DeformableDetrImageProcessorFast + - preprocess + - post_process_object_detection + ## DeformableDetrFeatureExtractor [[autodoc]] DeformableDetrFeatureExtractor diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md index dfaf40477a7b52..cf7c043e928901 100644 --- a/docs/source/en/model_doc/idefics3.md +++ b/docs/source/en/model_doc/idefics3.md @@ -51,6 +51,13 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) [[autodoc]] Idefics3Config +## Idefics3VisionConfig + +[[autodoc]] Idefics3VisionConfig + +## Idefics3VisionTransformer + +[[autodoc]] Idefics3VisionTransformer ## Idefics3Model diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md new file mode 100644 index 00000000000000..cb2afd25e20bca --- /dev/null +++ b/docs/source/en/model_doc/ijepa.md @@ -0,0 +1,92 @@ + + +# I-JEPA + +## Overview + +The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://arxiv.org/abs/2301.08243) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas. +I-JEPA is a self-supervised learning method that predicts the representations of one part of an image based on other parts of the same image. This approach focuses on learning semantic features without relying on pre-defined invariances from hand-crafted data transformations, which can bias specific tasks, or on filling in pixel-level details, which often leads to less meaningful representations. + +The abstract from the paper is the following: + +This paper demonstrates an approach for learning highly semantic image representations without relying on hand-crafted data-augmentations. We introduce the Image- based Joint-Embedding Predictive Architecture (I-JEPA), a non-generative approach for self-supervised learning from images. The idea behind I-JEPA is simple: from a single context block, predict the representations of various target blocks in the same image. A core design choice to guide I-JEPA towards producing semantic representations is the masking strategy; specifically, it is crucial to (a) sample tar- get blocks with sufficiently large scale (semantic), and to (b) use a sufficiently informative (spatially distributed) context block. Empirically, when combined with Vision Transform- ers, we find I-JEPA to be highly scalable. For instance, we train a ViT-Huge/14 on ImageNet using 16 A100 GPUs in under 72 hours to achieve strong downstream performance across a wide range of tasks, from linear classification to object counting and depth prediction. + + + + I-JEPA architecture. Taken from the original paper. + +This model was contributed by [jmtzt](https://huggingface.co/jmtzt). 
+The original code can be found [here](https://github.com/facebookresearch/ijepa). + +## How to use + +Here is how to use this model for image feature extraction: + +```python +import requests +import torch +from PIL import Image +from torch.nn.functional import cosine_similarity + +from transformers import AutoModel, AutoProcessor + +url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg" +url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg" +image_1 = Image.open(requests.get(url_1, stream=True).raw) +image_2 = Image.open(requests.get(url_2, stream=True).raw) + +model_id = "facebook/ijepa_vith14_1k" +processor = AutoProcessor.from_pretrained(model_id) +model = AutoModel.from_pretrained(model_id) + +@torch.no_grad() +def infer(image): + inputs = processor(image, return_tensors="pt") + outputs = model(**inputs) + return outputs.last_hidden_state.mean(dim=1) + + +embed_1 = infer(image_1) +embed_2 = infer(image_2) + +similarity = cosine_similarity(embed_1, embed_2) +print(similarity) +``` + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with I-JEPA. + + + +- [`IJepaForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +## IJepaConfig + +[[autodoc]] IJepaConfig + +## IJepaModel + +[[autodoc]] IJepaModel + - forward + +## IJepaForImageClassification + +[[autodoc]] IJepaForImageClassification + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md index b5fc634b621626..904a96bc786f07 100644 --- a/docs/source/en/model_doc/instructblip.md +++ b/docs/source/en/model_doc/instructblip.md @@ -33,6 +33,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but important difference: it also feeds the text prompt (instruction) to the Q-Former. +> [!NOTE] +> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there wil be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042). 
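The note above points to an external gist for expanding the embeddings. As an approximate sketch only — the placeholder token name `<image>`, the padding value, and the exact steps are assumptions here, and the linked gist remains the reference — the update could look roughly like this:

```python
# Approximate sketch of the processor/model update described in the note above.
# The `<image>` token name and `pad_to_multiple_of` value are assumptions; see the linked gist.
from transformers import AddedToken, InstructBlipForConditionalGeneration, InstructBlipProcessor

repo = "Salesforce/instructblip-vicuna-7b"  # illustrative checkpoint
processor = InstructBlipProcessor.from_pretrained(repo)
model = InstructBlipForConditionalGeneration.from_pretrained(repo)

# Copy the number of query tokens from the model config onto the processor
processor.num_query_tokens = model.config.num_query_tokens

# Register the placeholder token and grow the embedding matrix to match the new vocab size
image_token = AddedToken("<image>", normalized=False, special=True)
processor.tokenizer.add_tokens([image_token], special_tokens=True)
model.resize_token_embeddings(len(processor.tokenizer), pad_to_multiple_of=64)

# If you own the checkpoint, persist the updated processor and model afterwards
# processor.save_pretrained(repo); model.save_pretrained(repo)
```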
+ ## InstructBlipConfig [[autodoc]] InstructBlipConfig diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md index aa93feb6b6dced..8b2207ce176566 100644 --- a/docs/source/en/model_doc/instructblipvideo.md +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -35,6 +35,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m - The model was trained by sampling 4 frames per video, so it's recommended to sample 4 frames +> [!NOTE] +> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there wil be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042). + ## InstructBlipVideoConfig [[autodoc]] InstructBlipVideoConfig diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index 7f326bd0c006db..dec19ca5ef45db 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -40,6 +40,13 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ - Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results. + +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + ### Single image inference For best results, we recommend users to use the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history, passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". 
The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows: diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index b9146fbd33478a..88bd63e7101f17 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -53,6 +53,12 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history, passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint. We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows: diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index fe905dfb7932ab..f8a149f12b6779 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -50,6 +50,12 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. 
The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that. We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. Each content field has to be a list of dicts, as follows: diff --git a/docs/source/en/model_doc/olmo_1124.md b/docs/source/en/model_doc/olmo2.md similarity index 84% rename from docs/source/en/model_doc/olmo_1124.md rename to docs/source/en/model_doc/olmo2.md index f36ec438e57a84..8ca3326660b3f4 100644 --- a/docs/source/en/model_doc/olmo_1124.md +++ b/docs/source/en/model_doc/olmo2.md @@ -14,11 +14,11 @@ rendered properly in your Markdown viewer. --> -# OLMo November 2024 +# OLMo2 ## Overview -The OLMo November 2024 model is a successor of the OLMo model, which was proposed in +The OLMo2 model is the successor of the OLMo model, which was proposed in [OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838). The architectural changes from the original OLMo model to this model are: @@ -31,16 +31,16 @@ This model was contributed by [shanearora](https://huggingface.co/shanearora). The original code can be found [here](https://github.com/allenai/OLMo/tree/main/olmo). -## Olmo1124Config +## Olmo2Config -[[autodoc]] Olmo1124Config +[[autodoc]] Olmo2Config -## Olmo1124Model +## Olmo2Model -[[autodoc]] Olmo1124Model +[[autodoc]] Olmo2Model - forward -## Olmo1124ForCausalLM +## Olmo2ForCausalLM -[[autodoc]] Olmo1124ForCausalLM +[[autodoc]] Olmo2ForCausalLM - forward diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index ab604e4521fc73..62bdc004c51718 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -88,6 +88,11 @@ output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up [[autodoc]] PixtralImageProcessor - preprocess +## PixtralImageProcessorFast + +[[autodoc]] PixtralImageProcessorFast + - preprocess + ## PixtralProcessor [[autodoc]] PixtralProcessor diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md index 8ad220dc4bd113..6a1545e123297c 100644 --- a/docs/source/en/model_doc/rt_detr.md +++ b/docs/source/en/model_doc/rt_detr.md @@ -57,7 +57,7 @@ Initially, an image is processed using a pre-trained convolutional neural networ >>> with torch.no_grad(): ... outputs = model(**inputs) ->>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3) +>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3) >>> for result in results: ... for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]): diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 1c4b5b4b874dd7..105307196effd0 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -54,6 +54,12 @@ This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanT The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA). 
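The same processor-attribute note recurs on the LLaVA, LLaVA-NeXT, and LLaVA-NeXT-Video pages above and on the Video-LLaVA and VipLlava pages below. A minimal sketch of applying it, with an illustrative checkpoint:

```python
# Minimal sketch of setting the processor attributes the note asks for; checkpoint is illustrative.
from transformers import AutoProcessor, LlavaForConditionalGeneration

repo = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(repo, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(repo)

# Values come straight from the model config, as described in the note
processor.patch_size = model.config.vision_config.patch_size
processor.vision_feature_select_strategy = model.config.vision_feature_select_strategy
# 1 if the vision backbone prepends a CLS token to the patch sequence, otherwise 0
processor.num_additional_image_tokens = 1

# processor.save_pretrained(repo)  # persist the attributes if you own the checkpoint
```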
+> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + ## Usage example ### Single Media Mode diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md index b3e76cd292e40a..cb625e3711615b 100644 --- a/docs/source/en/model_doc/vipllava.md +++ b/docs/source/en/model_doc/vipllava.md @@ -39,6 +39,12 @@ This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) - Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results. +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + - For better results, we recommend users to use the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history, passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". 
The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows: ```python @@ -52,7 +58,7 @@ conversation = [ "content": [ {"type": "image"}, {"type": "text", "text": "What’s shown in this image?"}, - , + ], }, { "role": "assistant", diff --git a/docs/source/en/perf_infer_cpu.md b/docs/source/en/perf_infer_cpu.md index c0e017c020870e..7f8b525b3df610 100644 --- a/docs/source/en/perf_infer_cpu.md +++ b/docs/source/en/perf_infer_cpu.md @@ -41,8 +41,7 @@ Enable BetterTransformer with the [`PreTrainedModel.to_bettertransformer`] metho ```py from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder") -model.to_bettertransformer() +model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder", torch_dtype="auto") ``` ## TorchScript @@ -54,7 +53,7 @@ For a gentle introduction to TorchScript, see the [Introduction to PyTorch Torch With the [`Trainer`] class, you can enable JIT mode for CPU inference by setting the `--jit_mode_eval` flag: ```bash -python run_qa.py \ +python examples/pytorch/question-answering/run_qa.py \ --model_name_or_path csarron/bert-base-uncased-squad-v1 \ --dataset_name squad \ --do_eval \ @@ -86,7 +85,7 @@ pip install intel_extension_for_pytorch Set the `--use_ipex` and `--jit_mode_eval` flags in the [`Trainer`] class to enable JIT mode with the graph optimizations: ```bash -python run_qa.py \ +python examples/pytorch/question-answering/run_qa.py \ --model_name_or_path csarron/bert-base-uncased-squad-v1 \ --dataset_name squad \ --do_eval \ diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md new file mode 100644 index 00000000000000..9975094411527a --- /dev/null +++ b/docs/source/en/perf_infer_gpu_multi.md @@ -0,0 +1,68 @@ + + +# Multi-GPU inference + +Built-in Tensor Parallelism (TP) is now available with certain models using PyTorch. Tensor parallelism shards a model onto multiple GPUs, enabling larger model sizes, and parallelizes computations such as matrix multiplication. + +To enable tensor parallel, pass the argument `tp_plan="auto"` to [`~AutoModelForCausalLM.from_pretrained`]: + +```python +import os +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" + +# Initialize distributed +rank = int(os.environ["RANK"]) +device = torch.device(f"cuda:{rank}") +torch.distributed.init_process_group("nccl", device_id=device) + +# Retrieve tensor parallel model +model = AutoModelForCausalLM.from_pretrained( + model_id, + tp_plan="auto", +) + +# Prepare input tokens +tokenizer = AutoTokenizer.from_pretrained(model_id) +prompt = "Can I help" +inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device) + +# Distributed run +outputs = model(inputs) +``` + +You can use `torchrun` to launch the above script with multiple processes, each mapping to a GPU: + +``` +torchrun --nproc-per-node 4 demo.py +``` + +PyTorch tensor parallel is currently supported for the following models: +* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) + +You can request to add tensor parallel support for another model by opening a GitHub Issue or Pull Request. + +### Expected speedups + +You can benefit from considerable speedups for inference, especially for inputs with large batch size or long sequences. 
+ +For a single forward pass on [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) with a sequence length of 512 and various batch sizes, the expected speedup is as follows: + +[figure: expected speedup chart for Llama, sequence length 512, across batch sizes]
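Since the speedup figure above is only summarized here, a rough way to measure the forward-pass latency yourself is the timing sketch below; it mirrors the tensor-parallel setup shown earlier, and the model id, iteration count, and script name are illustrative:

```python
# Sketch: average forward latency under tensor parallelism.
# Launch with `torchrun --nproc-per-node 4 benchmark_tp.py`; values below are illustrative.
import os
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

rank = int(os.environ["RANK"])
device = torch.device(f"cuda:{rank}")
torch.distributed.init_process_group("nccl", device_id=device)

# Shard the model across the participating GPUs
model = AutoModelForCausalLM.from_pretrained(model_id, tp_plan="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("Can I help", return_tensors="pt").input_ids.to(device)

with torch.no_grad():
    _ = model(inputs)  # warmup
torch.cuda.synchronize(device)

iters = 10
start = time.perf_counter()
with torch.no_grad():
    for _ in range(iters):
        _ = model(inputs)
torch.cuda.synchronize(device)

if rank == 0:
    print(f"avg forward latency: {(time.perf_counter() - start) / iters * 1000:.2f} ms")
```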
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 84109746f95998..ab5e1c47a448f3 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -37,6 +37,7 @@ FlashAttention-2 is experimental and may change considerably in future versions. 2. partitioning the work between GPU threads to reduce communication and shared memory reads/writes between them FlashAttention-2 is currently supported for the following architectures: +* [Aria](https://huggingface.co/docs/transformers/model_doc/aria#transformers.AriaForConditionalGeneration) * [Bark](https://huggingface.co/docs/transformers/model_doc/bark#transformers.BarkModel) * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) * [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon) @@ -77,7 +78,7 @@ FlashAttention-2 is currently supported for the following architectures: * [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron) * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) -* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model) +* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) @@ -216,6 +217,7 @@ PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.o For now, Transformers supports SDPA inference and training for the following architectures: * [Albert](https://huggingface.co/docs/transformers/model_doc/albert#transformers.AlbertModel) +* [Aria](https://huggingface.co/docs/transformers/model_doc/aria#transformers.AriaForConditionalGeneration) * [Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer#transformers.ASTModel) * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) * [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel) @@ -235,6 +237,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model) +* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel) * [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2) * [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel) * [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel) @@ -242,7 +245,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel) * [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model) * 
[Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model) -* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel) +* [I-JEPA](https://huggingface.co/docs/transformers/model_doc/ijepa#transformers.IJepaModel) * [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel) * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel) * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel) @@ -261,7 +264,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) -* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model) +* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) @@ -405,7 +408,7 @@ To load a model in 4-bit for inference, use the `load_in_4bit` parameter. The `d from transformers import AutoModelForCausalLM model_name = "bigscience/bloom-2b5" -model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True) +model_4bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True) ``` To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU: @@ -414,7 +417,7 @@ To load a model in 4-bit for inference with multiple GPUs, you can control how m max_memory_mapping = {0: "600MB", 1: "1GB"} model_name = "bigscience/bloom-3b" model_4bit = AutoModelForCausalLM.from_pretrained( - model_name, device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping + model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping ) ``` @@ -432,7 +435,7 @@ To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `d from transformers import AutoModelForCausalLM, BitsAndBytesConfig model_name = "bigscience/bloom-2b5" -model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True)) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True)) ``` If you're loading a model in 8-bit for text generation, you should use the [`~transformers.GenerationMixin.generate`] method instead of the [`Pipeline`] function which is not optimized for 8-bit models and will be slower. Some sampling strategies, like nucleus sampling, are also not supported by the [`Pipeline`] for 8-bit models. 
You should also place all inputs on the same device as the model: @@ -442,7 +445,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig model_name = "bigscience/bloom-2b5" tokenizer = AutoTokenizer.from_pretrained(model_name) -model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True)) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True)) prompt = "Hello, my llama is cute" inputs = tokenizer(prompt, return_tensors="pt").to("cuda") @@ -456,7 +459,7 @@ To load a model in 4-bit for inference with multiple GPUs, you can control how m max_memory_mapping = {0: "1GB", 1: "2GB"} model_name = "bigscience/bloom-3b" model_8bit = AutoModelForCausalLM.from_pretrained( - model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping + model_name, torch_dtype="auto", device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping ) ``` @@ -515,7 +518,7 @@ quantization_config = BitsAndBytesConfig( ) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") -model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype="auto", quantization_config=quantization_config) # enable BetterTransformer model = model.to_bettertransformer() diff --git a/docs/source/en/perf_torch_compile.md b/docs/source/en/perf_torch_compile.md index acc424930b1c4e..2155a403b2b77f 100644 --- a/docs/source/en/perf_torch_compile.md +++ b/docs/source/en/perf_torch_compile.md @@ -27,7 +27,7 @@ To compile any computer vision model of your choice, call `torch.compile()` on t ```diff from transformers import AutoModelForImageClassification -model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to("cuda") +model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to(DEVICE) + model = torch.compile(model) ``` @@ -47,15 +47,17 @@ from PIL import Image import requests import numpy as np from transformers import AutoImageProcessor, AutoModelForImageClassification +from accelerate.test_utils.testing import get_backend +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) url = 'http://images.cocodataset.org/val2017/000000039769.jpg' image = Image.open(requests.get(url, stream=True).raw) processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to("cuda") +model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to(device) model = torch.compile(model) -processed_input = processor(image, return_tensors='pt').to(device="cuda") +processed_input = processor(image, return_tensors='pt').to(device) with torch.no_grad(): _ = model(**processed_input) @@ -66,13 +68,15 @@ with torch.no_grad(): ```python from transformers import AutoImageProcessor, AutoModelForObjectDetection +from accelerate.test_utils.testing import get_backend +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) 
processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") -model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to("cuda") +model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device) model = torch.compile(model) texts = ["a photo of a cat", "a photo of a dog"] -inputs = processor(text=texts, images=image, return_tensors="pt").to("cuda") +inputs = processor(text=texts, images=image, return_tensors="pt").to(device) with torch.no_grad(): _ = model(**inputs) @@ -82,11 +86,13 @@ with torch.no_grad(): ```python from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation +from accelerate.test_utils.testing import get_backend +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") -model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to("cuda") +model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to(device) model = torch.compile(model) -seg_inputs = processor(images=image, return_tensors="pt").to("cuda") +seg_inputs = processor(images=image, return_tensors="pt").to(device) with torch.no_grad(): _ = model(**seg_inputs) diff --git a/docs/source/en/perf_train_cpu.md b/docs/source/en/perf_train_cpu.md index 7ef98932d537ac..ab2f735ecbdd50 100644 --- a/docs/source/en/perf_train_cpu.md +++ b/docs/source/en/perf_train_cpu.md @@ -51,7 +51,7 @@ To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex` Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) - Training with IPEX using BF16 auto mixed precision on CPU: -
- python run_qa.py \
+ python examples/pytorch/question-answering/run_qa.py \
 --model_name_or_path google-bert/bert-base-uncased \
 --dataset_name squad \
 --do_train \
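For readers who configure training from Python rather than the command line, here is a minimal, illustrative sketch of the same IPEX + BF16 CPU setup expressed through `TrainingArguments` (the output directory is a placeholder; `use_cpu` supersedes the older `no_cuda` flag in recent releases):

```py
from transformers import TrainingArguments

# Illustrative sketch: mirrors the CLI flags used above for CPU training
# with IPEX and BF16 auto mixed precision.
training_args = TrainingArguments(
    output_dir="/tmp/debug_squad/",  # placeholder output directory
    do_train=True,
    use_ipex=True,  # enable Intel Extension for PyTorch optimizations
    bf16=True,      # BF16 auto mixed precision
    use_cpu=True,   # train on CPU (older releases use `no_cuda=True`)
)
```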
diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md
index ed782caca3b1f1..d6a029c471de08 100644
--- a/docs/source/en/perf_train_cpu_many.md
+++ b/docs/source/en/perf_train_cpu_many.md
@@ -75,7 +75,7 @@ The following command enables training with 2 processes on one Xeon node, with o
  export CCL_WORKER_COUNT=1
  export MASTER_ADDR=127.0.0.1
  mpirun -n 2 -genv OMP_NUM_THREADS=23 \
- python3 run_qa.py \
+ python3 examples/pytorch/question-answering/run_qa.py \
  --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
@@ -104,7 +104,7 @@ Now, run the following command in node0 and **4DDP** will be enabled in node0 an
  export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip
  mpirun -f hostfile -n 4 -ppn 2 \
  -genv OMP_NUM_THREADS=23 \
- python3 run_qa.py \
+ python3 examples/pytorch/question-answering/run_qa.py \
  --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md
index 858da99e7bc388..c810a18470a09e 100644
--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@@ -553,7 +553,7 @@ It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter.
 Examples:
 * Sample
 
-Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes be 5 x 2 x 512.
+Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes 5 x 2 x 512.
 
 * Operator
 
diff --git a/docs/source/en/performance.md b/docs/source/en/performance.md
index 94e756cf33ada6..b9176be04ec206 100644
--- a/docs/source/en/performance.md
+++ b/docs/source/en/performance.md
@@ -53,7 +53,7 @@ sections we go through the steps to run inference on CPU and single/multi-GPU se
 
 * [Inference on a single CPU](perf_infer_cpu)
 * [Inference on a single GPU](perf_infer_gpu_one)
-* [Multi-GPU inference](perf_infer_gpu_one)
+* [Multi-GPU inference](perf_infer_gpu_multi)
 * [XLA Integration for TensorFlow Models](tf_xla)
 
 
diff --git a/docs/source/en/perplexity.md b/docs/source/en/perplexity.md
index ac7ef8504e72b6..525f0d567bcb61 100644
--- a/docs/source/en/perplexity.md
+++ b/docs/source/en/perplexity.md
@@ -73,8 +73,9 @@ Let's demonstrate this process with GPT-2.
 
 ```python
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+from accelerate.test_utils.testing import get_backend
 
-device = "cuda"
+device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
 model_id = "openai-community/gpt2-large"
 model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
 tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md
index 3363c68ea417a3..357bc7f636ec25 100644
--- a/docs/source/en/pipeline_tutorial.md
+++ b/docs/source/en/pipeline_tutorial.md
@@ -59,10 +59,10 @@ Let's try the [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2)
 benchmarks. It also has the added benefit of predicting punctuation and casing, neither of which are possible with  
 Wav2Vec2.
 
-Let's give it a try here to see how it performs:
+Let's give it a try here to see how it performs. Set `torch_dtype="auto"` to automatically load the most memory-efficient data type the weights are stored in.
 
 ```py
->>> transcriber = pipeline(model="openai/whisper-large-v2")
+>>> transcriber = pipeline(model="openai/whisper-large-v2", torch_dtype="auto")
 >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
 {'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
 ```
diff --git a/docs/source/en/quantization/bitsandbytes.md b/docs/source/en/quantization/bitsandbytes.md
index e9447555e82449..6c6b92d0a6e594 100644
--- a/docs/source/en/quantization/bitsandbytes.md
+++ b/docs/source/en/quantization/bitsandbytes.md
@@ -64,7 +64,7 @@ model_8bit = AutoModelForCausalLM.from_pretrained(
 )
 ```
 
-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
+By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file.
 
 ```py
 import torch
@@ -75,7 +75,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 model_8bit = AutoModelForCausalLM.from_pretrained(
     "facebook/opt-350m", 
     quantization_config=quantization_config, 
-    torch_dtype=torch.float32
+    torch_dtype="auto"
 )
 model_8bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
 ```
@@ -112,7 +112,7 @@ model_4bit = AutoModelForCausalLM.from_pretrained(
 )
 ```
 
-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
+By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file.
 
 ```py
 import torch
@@ -123,7 +123,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True)
 model_4bit = AutoModelForCausalLM.from_pretrained(
     "facebook/opt-350m",
     quantization_config=quantization_config, 
-    torch_dtype=torch.float32
+    torch_dtype="auto"
 )
 model_4bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
 ```
@@ -190,6 +190,7 @@ Now load your model with the custom `device_map` and `quantization_config`:
 ```py
 model_8bit = AutoModelForCausalLM.from_pretrained(
     "bigscience/bloom-1b7",
+    torch_dtype="auto",
     device_map=device_map,
     quantization_config=quantization_config,
 )
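# Illustrative sketch: a custom `device_map` like the one referenced above is just a
# dict mapping module names to devices. The module names below follow a BLOOM-style
# layout and are assumptions; inspect `model.named_modules()` (or start from
# `device_map="auto"`) to find the real names for your checkpoint.
device_map = {
    "transformer.word_embeddings": 0,
    "transformer.word_embeddings_layernorm": 0,
    "transformer.h": 0,
    "transformer.ln_f": "cpu",
    "lm_head": "cpu",
}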
@@ -212,6 +213,7 @@ quantization_config = BitsAndBytesConfig(
 
 model_8bit = AutoModelForCausalLM.from_pretrained(
     model_id,
+    torch_dtype="auto",
     device_map=device_map,
     quantization_config=quantization_config,
 )
@@ -232,6 +234,7 @@ quantization_config = BitsAndBytesConfig(
 
 model_8bit = AutoModelForCausalLM.from_pretrained(
     model_id,
+    torch_dtype="auto",
     device_map="auto",
     quantization_config=quantization_config,
 )
@@ -275,7 +278,7 @@ nf4_config = BitsAndBytesConfig(
     bnb_4bit_quant_type="nf4",
 )
 
-model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)
+model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", quantization_config=nf4_config)
 ```
 
 For inference, the `bnb_4bit_quant_type` does not have a huge impact on performance. However, to remain consistent with the model weights, you should use the `bnb_4bit_compute_dtype` and `torch_dtype` values.
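For example, a minimal sketch that keeps the two values consistent (the checkpoint name is illustrative, and bfloat16 is assumed to match how the weights were stored):

```py
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Keep the compute dtype of the 4-bit layers and the dtype of the remaining
# modules consistent with the stored weights (bfloat16 assumed here).
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model_nf4 = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b",  # illustrative checkpoint
    torch_dtype=torch.bfloat16,
    quantization_config=nf4_config,
)
```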
@@ -292,7 +295,7 @@ double_quant_config = BitsAndBytesConfig(
     bnb_4bit_use_double_quant=True,
 )
 
-model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", quantization_config=double_quant_config)
+model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", torch_dtype="auto", quantization_config=double_quant_config)
 ```
 
 ## Dequantizing `bitsandbytes` models
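As a rough sketch of what converting a quantized model back to a dense dtype can look like (assuming the `dequantize()` helper available for bitsandbytes-quantized models; the checkpoint name is only an example):

```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",  # small example checkpoint
    torch_dtype="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# Convert the quantized weights back to dense tensors. Quantization is lossy,
# so the dequantized weights are not identical to the original ones.
model.dequantize()
```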
diff --git a/docs/source/en/quantization/fbgemm_fp8.md b/docs/source/en/quantization/fbgemm_fp8.md
index ff9e18f823c935..61cf8a059bf277 100644
--- a/docs/source/en/quantization/fbgemm_fp8.md
+++ b/docs/source/en/quantization/fbgemm_fp8.md
@@ -33,13 +33,14 @@ pip install --upgrade accelerate fbgemm-gpu torch
 
 If you are having issues with fbgemm-gpu and torch library, you might need to install the nightly release. You can follow the instruction [here](https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries:~:text=found%20here.-,Install%20the%20FBGEMM_GPU%20Package,-Install%20through%20PyTorch)
 
+By default, the weights are loaded in full precision (torch.float32) regardless of the data type they are actually stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file, which is usually the most memory-optimal choice.
 
 ```py
 from transformers import FbgemmFp8Config, AutoModelForCausalLM, AutoTokenizer
 
 model_name = "meta-llama/Meta-Llama-3-8B"
 quantization_config = FbgemmFp8Config()
-quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
 
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 input_text = "What are we having for dinner?"
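# Illustrative continuation, assuming the quantized model was placed on a CUDA
# device by device_map="auto": generate a short completion for the prompt above.
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

output = quantized_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))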
diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md
index 91c6ebd40dab4d..0fb72d26058e55 100644
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@@ -55,7 +55,7 @@ Use the table below to help you decide which quantization method to use.
 | GGUF / GGML (llama.cpp)             | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🔴         | 🔴                       | 1 - 8          | 🔴                                   | [See GGUF section](../gguf)                | [See GGUF section](../gguf)                      | https://github.com/ggerganov/llama.cpp      |
 | [GPTQ](./gptq)                                | 🔴                       | 🔴   | 🟢        | 🟢              | 🔴                     | 🔴         | 🔴                       | 2 - 3 - 4 - 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
 | [HQQ](./hqq)                                 | 🟢                       | 🟢    | 🟢        | 🔴              | 🔴                     | 🔴         | 🟢                       | 1 - 8          | 🟢                                   | 🔴            | 🟢                      | https://github.com/mobiusml/hqq/            |
-| [Quanto](./quanto)                              | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🔴         | 🟢                       | 2 / 4 / 8      | 🔴                                   | 🔴            | 🟢                      | https://github.com/huggingface/quanto       |
+| [optimum-quanto](./quanto)                              | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🔴         | 🟢                       | 2 / 4 / 8      | 🔴                                   | 🔴            | 🟢                      | https://github.com/huggingface/optimum-quanto       |
 | [FBGEMM_FP8](./fbgemm_fp8.md)                              | 🟢                       | 🔴    | 🟢        | 🔴              | 🔴                      | 🔴         | 🔴                        | 8      | 🔴                                   | 🟢            | 🟢                      | https://github.com/pytorch/FBGEMM       |
 | [torchao](./torchao.md)                              | 🟢                       |     | 🟢        | 🔴              | partial support (int4 weight only)       | 🔴         |                       | 4 / 8      |                                   | 🟢🔴           | 🟢                      | https://github.com/pytorch/ao       |
 
diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md
index 18135b2ec2fce7..7feadefd83d2aa 100644
--- a/docs/source/en/quantization/quanto.md
+++ b/docs/source/en/quantization/quanto.md
@@ -14,21 +14,21 @@ rendered properly in your Markdown viewer.
 
 -->
 
-# Quanto
+# Optimum-quanto
 
 
 
-Try Quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
+Try optimum-quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
 
 
 
 
-[🤗 Quanto](https://github.com/huggingface/quanto) library is a versatile pytorch quantization toolkit. The quantization method used is the linear quantization. Quanto provides several unique features such as:
+The [🤗 optimum-quanto](https://github.com/huggingface/optimum-quanto) library is a versatile PyTorch quantization toolkit based on linear quantization. It provides several unique features such as:
 
 - weights quantization (`float8`,`int8`,`int4`,`int2`)
 - activation quantization (`float8`,`int8`)
 - modality agnostic (e.g CV,LLM)
-- device agnostic (e.g CUDA,MPS,CPU)
+- device agnostic (e.g CUDA,XPU,MPS,CPU)
 - compatibility with `torch.compile`
 - easy to add custom kernel for specific device
 - supports quantization aware training
@@ -37,12 +37,14 @@ Try Quanto + transformers with this [notebook](https://colab.research.google.com
 Before you begin, make sure the following libraries are installed:
 
 ```bash
-pip install quanto accelerate transformers
+pip install optimum-quanto accelerate transformers
 ```
 
 Now you can quantize a model by passing [`QuantoConfig`] object in the [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it contains `torch.nn.Linear` layers. 
 
-The integration with transformers only supports weights quantization. For the more complex use case such as activation quantization, calibration and quantization aware training, you should use [quanto](https://github.com/huggingface/quanto) library instead. 
+The integration with transformers only supports weight quantization. For more complex use cases such as activation quantization, calibration, and quantization-aware training, use the [optimum-quanto](https://github.com/huggingface/optimum-quanto) library directly.
+
+By default, the weights are loaded in full precision (torch.float32) regardless of the data type they are actually stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file, which is usually the most memory-optimal choice.
 
 ```py
 from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
@@ -50,12 +52,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
 model_id = "facebook/opt-125m"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 quantization_config = QuantoConfig(weights="int8")
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="cuda:0", quantization_config=quantization_config)
 ```
 
 Note that serialization is not supported yet with transformers but it is coming soon! If you want to save the model, you can use quanto library instead.
 
-Quanto library uses linear quantization algorithm for quantization. Even though this is a basic quantization technique, we get very good results! Have a look at the following benchmark (llama-2-7b on perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/quanto/tree/main/bench/generation)
+The optimum-quanto library uses a linear quantization algorithm. Even though this is a basic quantization technique, it gives very good results! Have a look at the following benchmark (llama-2-7b on the perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/optimum-quanto/tree/main/bench/generation)
 
 
diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index cd1d0188c33eb5..38f7c074c97d90 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -19,6 +19,7 @@ Before you begin, make sure the following libraries are installed with their lat pip install --upgrade torch torchao ``` +By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. ```py import torch @@ -28,7 +29,7 @@ model_name = "meta-llama/Meta-Llama-3-8B" # We support int4_weight_only, int8_weight_only and int8_dynamic_activation_int8_weight # More examples and documentations for arguments can be found in https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques quantization_config = TorchAoConfig("int4_weight_only", group_size=128) -quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config) +quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config) tokenizer = AutoTokenizer.from_pretrained(model_name) input_text = "What are we having for dinner?" diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index 404b6eac7fe44b..70afa1ea57107f 100755 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -245,13 +245,15 @@ Check out the [preprocess](./preprocessing) tutorial for more details about toke -🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]: +🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]. + +By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. ```py >>> from transformers import AutoModelForSequenceClassification >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype="auto") ``` @@ -416,12 +418,12 @@ All models are a standard [`torch.nn.Module`](https://pytorch.org/docs/stable/nn Depending on your task, you'll typically pass the following parameters to [`Trainer`]: -1. You'll start with a [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module): +1. You'll start with a [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module). 
Set `torch_dtype="auto"` to automatically load the most memory-efficient data type the weights are stored in. ```py >>> from transformers import AutoModelForSequenceClassification - >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") ``` 2. [`TrainingArguments`] contains the model hyperparameters you can change like learning rate, batch size, and the number of epochs to train for. The default values are used if you don't specify any training arguments: diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 514ec3fbfe0b93..49fdc9db60d4d7 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -26,7 +26,7 @@ after a natural disaster, monitoring crop health, or helping screen medical imag This guide illustrates how to: -1. Fine-tune [ViT](model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image. +1. Fine-tune [ViT](../model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image. 2. Use your fine-tuned model for inference. diff --git a/docs/source/en/tasks/image_feature_extraction.md b/docs/source/en/tasks/image_feature_extraction.md index 80b701588b26b4..e55a9e2379d531 100644 --- a/docs/source/en/tasks/image_feature_extraction.md +++ b/docs/source/en/tasks/image_feature_extraction.md @@ -84,7 +84,7 @@ If you want to get the last hidden states before pooling, avoid passing any valu ```python pipe = pipeline(task="image-feature-extraction", model_name="google/vit-base-patch16-224", device=DEVICE) -output = pipe(image_real) +outputs = pipe(image_real) ``` Since the outputs are unpooled, we get the last hidden states where the first dimension is the batch size, and the last two are the embedding shape. diff --git a/docs/source/en/tasks/image_text_to_text.md b/docs/source/en/tasks/image_text_to_text.md index 261abf947290d1..28bd98457ee016 100644 --- a/docs/source/en/tasks/image_text_to_text.md +++ b/docs/source/en/tasks/image_text_to_text.md @@ -120,6 +120,46 @@ print(generated_texts) ## ['User: What do we see in this image? \nAssistant: In this image we can see two cats on the nets. \nUser: And how about this image? \nAssistant: In this image we can see flowers, plants and insect.'] ``` +## Pipeline + +The fastest way to get started is to use the [`Pipeline`] API. Specify the `"image-text-to-text"` task and the model you want to use. + +```python +from transformers import pipeline +pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf") +``` + +The example below uses chat templates to format the text inputs. + +```python +messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg", + }, + {"type": "text", "text": "Describe this image."}, + ], + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "There's a pink flower"}, + ], + }, + ] +``` + +Pass the chat template formatted text and image to [`Pipeline`] and set `return_full_text=False` to remove the input from the generated output. + +```python +outputs = pipe(text=messages, max_new_tokens=20, return_full_text=False) +outputs[0]["generated_text"] +# with a yellow center in the foreground. 
The flower is surrounded by red and white flowers with green stems +``` + ## Streaming We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS-8B. @@ -189,7 +229,7 @@ Now let's call the `model_inference` function we created and stream the values. ```python generator = model_inference( user_prompt="And what is in this image?", - chat_history=messages, + chat_history=messages[:2], max_new_tokens=100, images=images ) diff --git a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md index 17fb363df8e2a0..c1ccafb6fc5d2a 100644 --- a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md +++ b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md @@ -17,7 +17,7 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification for this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between it's outputs and the teacher's outputs, thus making it mimic the behavior. It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this. +Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification for this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between its outputs and the teacher's outputs, thus making it mimic the behavior. It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this. This guide demonstrates how you can distill a [fine-tuned ViT model](https://huggingface.co/merve/vit-mobilenet-beans-224) (teacher model) to a [MobileNet](https://huggingface.co/google/mobilenet_v2_1.4_224) (student model) using the [Trainer API](https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer) of 🤗 Transformers. diff --git a/docs/source/en/tasks/video_text_to_text.md b/docs/source/en/tasks/video_text_to_text.md index fcc1c86e8bd7ac..3929f7994bdafb 100644 --- a/docs/source/en/tasks/video_text_to_text.md +++ b/docs/source/en/tasks/video_text_to_text.md @@ -47,7 +47,7 @@ model_id = "llava-hf/llava-interleave-qwen-0.5b-hf" processor = LlavaProcessor.from_pretrained(model_id) model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16) -model.to("cuda") +model.to("cuda") # can also be xpu, mps, npu etc. 
depending on your hardware accelerator ``` Some models directly consume the ` \ No newline at end of file + diff --git a/docs/source/en/testing.md b/docs/source/en/testing.md index 1da8a62456ee2c..9e85f2248e16fc 100644 --- a/docs/source/en/testing.md +++ b/docs/source/en/testing.md @@ -428,7 +428,7 @@ pytest --instafail ### To GPU or not to GPU -On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""`: +On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""` for CUDA GPUs: ```bash CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py @@ -441,10 +441,12 @@ second gpu if you have gpus `0` and `1`, you can run: CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py ``` +For Intel GPUs, use `ZE_AFFINITY_MASK` instead of `CUDA_VISIBLE_DEVICES` in the above example. + This is handy when you want to run different tasks on different GPUs. Some tests must be run on CPU-only, others on either CPU or GPU or TPU, yet others on multiple-GPUs. The following skip -decorators are used to set the requirements of tests CPU/GPU/TPU-wise: +decorators are used to set the requirements of tests CPU/GPU/XPU/TPU-wise: - `require_torch` - this test will run only under torch - `require_torch_gpu` - as `require_torch` plus requires at least 1 GPU diff --git a/docs/source/en/tiktoken.md b/docs/source/en/tiktoken.md index 528ff4f76dc5f6..aac81e24fdd738 100644 --- a/docs/source/en/tiktoken.md +++ b/docs/source/en/tiktoken.md @@ -36,3 +36,25 @@ from transformers import AutoTokenizer model_id = "meta-llama/Meta-Llama-3-8B-Instruct" tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original") ``` +## Create tiktoken tokenizer + +The `tokenizer.model` file contains no information about additional tokens or pattern strings. If these are important, convert the tokenizer to `tokenizer.json`, the appropriate format for [`PreTrainedTokenizerFast`]. + +Generate the `tokenizer.model` file with [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) and then convert it to `tokenizer.json` with [`convert_tiktoken_to_fast`]. + +```py + +from transformers.integrations.tiktoken import convert_tiktoken_to_fast +from tiktoken import get_encoding + +# You can load your custom encoding or the one provided by OpenAI +encoding = get_encoding("gpt2") +convert_tiktoken_to_fast(encoding, "config/save/dir") +``` + +The resulting `tokenizer.json` file is saved to the specified directory and can be loaded with [`PreTrainedTokenizerFast`]. + +```py +tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir") +``` + diff --git a/docs/source/en/trainer.md b/docs/source/en/trainer.md index 7bee3472892727..e3a66f42042485 100644 --- a/docs/source/en/trainer.md +++ b/docs/source/en/trainer.md @@ -174,7 +174,7 @@ trainer = Trainer( processing_class=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, - callback=[EarlyStoppingCallback()], + callbacks=[EarlyStoppingCallback()], ) ``` diff --git a/docs/source/en/training.md b/docs/source/en/training.md index aacf174fbd6be0..fa6ef0c0da6e4b 100644 --- a/docs/source/en/training.md +++ b/docs/source/en/training.md @@ -81,12 +81,14 @@ just use the button at the top-right of that framework's block! 🤗 Transformers provides a [`Trainer`] class optimized for training 🤗 Transformers models, making it easier to start training without manually writing your own training loop. 
The [`Trainer`] API supports a wide range of training options and features such as logging, gradient accumulation, and mixed precision. -Start by loading your model and specify the number of expected labels. From the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields), you know there are five labels: +Start by loading your model and specify the number of expected labels. From the Yelp Review [dataset card](https://huggingface.co/datasets/yelp_review_full#data-fields), you know there are five labels. + +By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. ```py >>> from transformers import AutoModelForSequenceClassification ->>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5) +>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5, torch_dtype="auto") ``` @@ -287,9 +289,10 @@ model.fit(tf_dataset) At this point, you may need to restart your notebook or execute the following code to free some memory: ```py +from accelerate.utils.memory import clear_device_cache del model del trainer -torch.cuda.empty_cache() +clear_device_cache() ``` Next, manually postprocess `tokenized_dataset` to prepare it for training. @@ -364,8 +367,9 @@ Lastly, specify `device` to use a GPU if you have access to one. Otherwise, trai ```py >>> import torch +>>> from accelerate.test_utils.testing import get_backend ->>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) >>> model.to(device) ``` diff --git a/docs/source/fr/installation.md b/docs/source/fr/installation.md index bbc93d810f0df1..da3e585df7aee3 100644 --- a/docs/source/fr/installation.md +++ b/docs/source/fr/installation.md @@ -159,7 +159,7 @@ conda install conda-forge::transformers Les modèles pré-entraînés sont téléchargés et mis en cache localement dans le dossier suivant : `~/.cache/huggingface/hub`. C'est le dossier par défaut donné par la variable d'environnement `TRANSFORMERS_CACHE`. Sur Windows, le dossier par défaut est `C:\Users\nom_utilisateur\.cache\huggingface\hub`. Vous pouvez modifier les variables d'environnement indiquées ci-dessous - par ordre de priorité - pour spécifier un dossier de cache différent : -1. Variable d'environnement (par défaut) : `HUGGINGFACE_HUB_CACHE` ou `TRANSFORMERS_CACHE`. +1. Variable d'environnement (par défaut) : `HF_HUB_CACHE` ou `TRANSFORMERS_CACHE`. 2. Variable d'environnement : `HF_HOME`. 3. Variable d'environnement : `XDG_CACHE_HOME` + `/huggingface`. diff --git a/docs/source/ja/installation.md b/docs/source/ja/installation.md index a0b9dfe3bdbd7a..cb4919caf50571 100644 --- a/docs/source/ja/installation.md +++ b/docs/source/ja/installation.md @@ -145,7 +145,7 @@ conda install conda-forge::transformers 学習済みモデルはダウンロードされ、ローカルにキャッシュされます: `~/.cache/huggingface/hub`. これはシェル環境変数`TRANSFORMERS_CACHE`で指定されるデフォルトのディレクトリです。Windowsでは、デフォルトのディレクトリは`C:\Users\username\.cache\huggingface\hub`になっています。異なるキャッシュディレクトリを指定するために、以下のシェル環境変数を変更することが可能です。優先度は以下の順番に対応します: -1. シェル環境変数 (デフォルト): `HUGGINGFACE_HUB_CACHE` または `TRANSFORMERS_CACHE`. +1. 
シェル環境変数 (デフォルト): `HF_HUB_CACHE` または `TRANSFORMERS_CACHE`. 2. シェル環境変数: `HF_HOME`. 3. シェル環境変数: `XDG_CACHE_HOME` + `/huggingface`. diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index e924ebe34d1c2a..7e9567769cca1a 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -380,8 +380,8 @@ title: (번역중) DPR - local: in_translation title: (번역중) ELECTRA - - local: in_translation - title: (번역중) Encoder Decoder Models + - local: model_doc/encoder-decoder + title: 인코더 디코더 모델 - local: in_translation title: (번역중) ERNIE - local: in_translation diff --git a/docs/source/ko/installation.md b/docs/source/ko/installation.md index 1583e994d6afe3..a744db40d29123 100644 --- a/docs/source/ko/installation.md +++ b/docs/source/ko/installation.md @@ -145,7 +145,7 @@ conda install conda-forge::transformers 사전훈련된 모델은 다운로드된 후 로컬 경로 `~/.cache/huggingface/hub`에 캐시됩니다. 셸 환경 변수 `TRANSFORMERS_CACHE`의 기본 디렉터리입니다. Windows의 경우 기본 디렉터리는 `C:\Users\username\.cache\huggingface\hub`입니다. 아래의 셸 환경 변수를 (우선 순위) 순서대로 변경하여 다른 캐시 디렉토리를 지정할 수 있습니다. -1. 셸 환경 변수 (기본): `HUGGINGFACE_HUB_CACHE` 또는 `TRANSFORMERS_CACHE` +1. 셸 환경 변수 (기본): `HF_HUB_CACHE` 또는 `TRANSFORMERS_CACHE` 2. 셸 환경 변수: `HF_HOME` 3. 셸 환경 변수: `XDG_CACHE_HOME` + `/huggingface` diff --git a/docs/source/ko/model_doc/encoder-decoder.md b/docs/source/ko/model_doc/encoder-decoder.md new file mode 100644 index 00000000000000..c5c55356139536 --- /dev/null +++ b/docs/source/ko/model_doc/encoder-decoder.md @@ -0,0 +1,167 @@ + + +# 인코더-디코더 모델[[Encoder Decoder Models]] + +## 개요[[Overview]] + +[`EncoderDecoderModel`]은 사전 학습된 자동 인코딩(autoencoding) 모델을 인코더로, 사전 학습된 자가 회귀(autoregressive) 모델을 디코더로 활용하여 시퀀스-투-시퀀스(sequence-to-sequence) 모델을 초기화하는 데 이용됩니다. + +사전 학습된 체크포인트를 활용해 시퀀스-투-시퀀스 모델을 초기화하는 것이 시퀀스 생성(sequence generation) 작업에 효과적이라는 점이 Sascha Rothe, Shashi Narayan, Aliaksei Severyn의 논문 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461)에서 입증되었습니다. + +[`EncoderDecoderModel`]이 학습/미세 조정된 후에는 다른 모델과 마찬가지로 저장/불러오기가 가능합니다. 자세한 사용법은 예제를 참고하세요. + +이 아키텍처의 한 가지 응용 사례는 두 개의 사전 학습된 [`BertModel`]을 각각 인코더와 디코더로 활용하여 요약 모델(summarization model)을 구축하는 것입니다. 이는 Yang Liu와 Mirella Lapata의 논문 [Text Summarization with Pretrained Encoders](https://arxiv.org/abs/1908.08345)에서 제시된 바 있습니다. + +## 모델 설정에서 `EncoderDecoderModel`을 무작위 초기화하기[[Randomly initializing `EncoderDecoderModel` from model configurations.]] + +[`EncoderDecoderModel`]은 인코더와 디코더 설정(config)을 기반으로 무작위 초기화를 할 수 있습니다. 아래 예시는 [`BertModel`] 설정을 인코더로, 기본 [`BertForCausalLM`] 설정을 디코더로 사용하는 방법을 보여줍니다. + +```python +>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel + +>>> config_encoder = BertConfig() +>>> config_decoder = BertConfig() + +>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) +>>> model = EncoderDecoderModel(config=config) +``` + +## 사전 학습된 인코더와 디코더로 `EncoderDecoderModel` 초기화하기[[Initialising `EncoderDecoderModel` from a pretrained encoder and a pretrained decoder.]] + +[`EncoderDecoderModel`]은 사전 학습된 인코더 체크포인트와 사전 학습된 디코더 체크포인트를 사용해 초기화할 수 있습니다. BERT와 같은 모든 사전 학습된 자동 인코딩(auto-encoding) 모델은 인코더로 활용할 수 있으며, GPT2와 같은 자가 회귀(autoregressive) 모델이나 BART의 디코더와 같이 사전 학습된 시퀀스-투-시퀀스 디코더 모델을 디코더로 사용할 수 있습니다. 디코더로 선택한 아키텍처에 따라 교차 어텐션(cross-attention) 레이어가 무작위로 초기화될 수 있습니다. 사전 학습된 인코더와 디코더 체크포인트를 이용해 [`EncoderDecoderModel`]을 초기화하려면, 모델을 다운스트림 작업에 대해 미세 조정(fine-tuning)해야 합니다. 
이에 대한 자세한 내용은 [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder)에 설명되어 있습니다. 이 작업을 위해 `EncoderDecoderModel` 클래스는 [`EncoderDecoderModel.from_encoder_decoder_pretrained`] 메서드를 제공합니다. + + +```python +>>> from transformers import EncoderDecoderModel, BertTokenizer + +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased") +``` + +## 기존 `EncoderDecoderModel` 체크포인트 불러오기 및 추론하기[[Loading an existing `EncoderDecoderModel` checkpoint and perform inference.]] + +`EncoderDecoderModel` 클래스의 미세 조정(fine-tuned)된 체크포인트를 불러오려면, Transformers의 다른 모델 아키텍처와 마찬가지로 [`EncoderDecoderModel`]에서 제공하는 `from_pretrained(...)`를 사용할 수 있습니다. + +추론을 수행하려면 [`generate`] 메서드를 활용하여 텍스트를 자동 회귀(autoregressive) 방식으로 생성할 수 있습니다. 이 메서드는 탐욕 디코딩(greedy decoding), 빔 서치(beam search), 다항 샘플링(multinomial sampling) 등 다양한 디코딩 방식을 지원합니다. + +```python +>>> from transformers import AutoTokenizer, EncoderDecoderModel + +>>> # 미세 조정된 seq2seq 모델과 대응하는 토크나이저 가져오기 +>>> model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail") +>>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail") + +>>> # let's perform inference on a long piece of text +>>> ARTICLE_TO_SUMMARIZE = ( +... "PG&E stated it scheduled the blackouts in response to forecasts for high winds " +... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " +... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." +... ) +>>> input_ids = tokenizer(ARTICLE_TO_SUMMARIZE, return_tensors="pt").input_ids + +>>> # 자기회귀적으로 요약 생성 (기본적으로 그리디 디코딩 사용) +>>> generated_ids = model.generate(input_ids) +>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +>>> print(generated_text) +nearly 800 thousand customers were affected by the shutoffs. the aim is to reduce the risk of wildfires. nearly 800, 000 customers were expected to be affected by high winds amid dry conditions. pg & e said it scheduled the blackouts to last through at least midday tomorrow. +``` + +## `TFEncoderDecoderModel`에 Pytorch 체크포인트 불러오기[[Loading a PyTorch checkpoint into `TFEncoderDecoderModel`.]] + +[`TFEncoderDecoderModel.from_pretrained`] 메서드는 현재 Pytorch 체크포인트를 사용한 모델 초기화를 지원하지 않습니다. 이 메서드에 `from_pt=True`를 전달하면 예외(exception)가 발생합니다. 특정 인코더-디코더 모델에 대한 Pytorch 체크포인트만 존재하는 경우, 다음과 같은 해결 방법을 사용할 수 있습니다: + +```python +>>> # 파이토치 체크포인트에서 로드하는 해결 방법 +>>> from transformers import EncoderDecoderModel, TFEncoderDecoderModel + +>>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16") + +>>> _model.encoder.save_pretrained("./encoder") +>>> _model.decoder.save_pretrained("./decoder") + +>>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained( +... "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True +... ) +>>> # 이 부분은 특정 모델의 구체적인 세부사항을 복사할 때에만 사용합니다. +>>> model.config = _model.config +``` + +## 학습[[Training]] + +모델이 생성된 후에는 BART, T5 또는 기타 인코더-디코더 모델과 유사한 방식으로 미세 조정(fine-tuning)할 수 있습니다. +보시다시피, 손실(loss)을 계산하려면 단 2개의 입력만 필요합니다: `input_ids`(입력 시퀀스를 인코딩한 `input_ids`)와 `labels`(목표 시퀀스를 인코딩한 `input_ids`). 
+ +```python +>>> from transformers import BertTokenizer, EncoderDecoderModel + +>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased") + +>>> model.config.decoder_start_token_id = tokenizer.cls_token_id +>>> model.config.pad_token_id = tokenizer.pad_token_id + +>>> input_ids = tokenizer( +... "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side.During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft).Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.", +... return_tensors="pt", +... ).input_ids + +>>> labels = tokenizer( +... "the eiffel tower surpassed the washington monument to become the tallest structure in the world. it was the first structure to reach a height of 300 metres in paris in 1930. it is now taller than the chrysler building by 5. 2 metres ( 17 ft ) and is the second tallest free - standing structure in paris.", +... return_tensors="pt", +... ).input_ids + +>>> # forward 함수가 자동으로 적합한 decoder_input_ids를 생성합니다. +>>> loss = model(input_ids=input_ids, labels=labels).loss +``` +훈련에 대한 자세한 내용은 [colab](https://colab.research.google.com/drive/1WIk2bxglElfZewOHboPFNj8H44_VAyKE?usp=sharing#scrollTo=ZwQIEhKOrJpl) 노트북을 참조하세요. + +이 모델은 [thomwolf](https://github.com/thomwolf)가 기여했으며, 이 모델에 대한 TensorFlow 및 Flax 버전은 [ydshieh](https://github.com/ydshieh)가 기여했습니다. 
+ + +## EncoderDecoderConfig + +[[autodoc]] EncoderDecoderConfig + + + + +## EncoderDecoderModel + +[[autodoc]] EncoderDecoderModel + - forward + - from_encoder_decoder_pretrained + + + + +## TFEncoderDecoderModel + +[[autodoc]] TFEncoderDecoderModel + - call + - from_encoder_decoder_pretrained + + + + +## FlaxEncoderDecoderModel + +[[autodoc]] FlaxEncoderDecoderModel + - __call__ + - from_encoder_decoder_pretrained + + + diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml index 07c97e51550cb7..d4863efde710ea 100644 --- a/docs/source/zh/_toctree.yml +++ b/docs/source/zh/_toctree.yml @@ -52,6 +52,10 @@ title: 导出为 TorchScript - local: gguf title: 与 GGUF 格式的互操作性 + - local: tiktoken + title: 与 Tiktoken 文件的互操作性 + - local: community + title: 社区资源 title: 开发者指南 - sections: - local: performance @@ -59,6 +63,8 @@ - sections: - local: fsdp title: 完全分片数据并行 + - local: perf_train_special + title: 在 Apple silicon 芯片上进行 PyTorch 训练 - local: perf_hardware title: 用于训练的定制硬件 - local: hpo_train @@ -86,6 +92,10 @@ title: 🤗Transformers能做什么 - local: tokenizer_summary title: 分词器的摘要 + - local: attention + title: 注意力机制 + - local: bertology + title: 基于BERT进行的相关研究 title: 概念指南 - sections: - sections: diff --git a/docs/source/zh/attention.md b/docs/source/zh/attention.md new file mode 100644 index 00000000000000..357a574a2d2e69 --- /dev/null +++ b/docs/source/zh/attention.md @@ -0,0 +1,37 @@ + + +# 注意力机制 + +大多数 transformer 模型使用完全注意力机制,该机制采用正方形的注意力矩阵。当输入很长的文本时,这将导致巨大的计算瓶颈。Longformer 和 Reformer 是提高注意力机制效率的改进模型,它们使用稀疏化的注意力矩阵来加速训练。 + +## 局部敏感哈希注意力机制(LSH attention) + +[Reformer](model_doc/reformer)使用LSH(局部敏感哈希)的注意力机制。在计算softmax(QK^t)时,只有矩阵QK^t中的最大元素(在softmax维度上)会做出有用的贡献。所以对于Q中的每个查询q,我们只需要考虑K中与q接近的键k,这里使用了一个哈希函数来确定q和k是否接近。注意力掩码被修改以掩盖当前的词符(token)(除了第一个位置之外),因为这样会使得查询和键相等(因此非常相似)。由于哈希可能会有些随机性,所以在实践中使用多个哈希函数(由n_rounds参数确定),然后一起求平均。 + +## 局部注意力机制(Local attention) +[Longformer](model_doc/longformer)使用局部注意力机制:通常情况下,局部上下文(例如,左边和右边的两个词符是什么?)对于给定词符的操作已经足够了。此外,通过堆叠具有小窗口的注意力层,最后一层将拥有不仅仅是窗口内词符的感受野,这使得它们能构建整个句子的表示。 + +一些预先选定的输入词符也被赋予全局注意力:对于这些少数词符,注意力矩阵可以访问所有词符(tokens),并且这个过程是对称的:所有其他词符除了它们局部窗口内的词符之外,也可以访问这些特定的词符。这在论文的图2d中有展示,下面是一个样本注意力掩码: + +
+ +
+ +使用参数更少的注意力矩阵,可以让模型处理更长的输入序列。 + +## 其他技巧 + +### 轴向位置编码 + +[Reformer](model_doc/reformer)模型使用轴向位置编码:在传统的transformer模型中,位置编码矩阵E的大小是\\(l\\)乘以\\(d\\),其中\\(l\\)是序列长度,\\(d\\)是隐藏状态的维度。如果你有非常长的文本,这个矩阵可能会非常大,将会占用大量的GPU显存。为了缓解这个问题,轴向位置编码将这个大矩阵E分解成两个较小的矩阵E1和E2,它们的维度分别是\\(l_{1} \times d_{1}\\) 和\\(l_{2} \times d_{2}\\),满足\\(l_{1} \times l_{2} = l\\)和\\(d_{1} + d_{2} = d\\)(通过长度的乘积,最终得到的矩阵要小得多)。在E中,对于时间步\\(j\\) 的嵌入是通过连接E1中时间步 \\(j \% l1\\) 的嵌入和E2中时间步\\(j // l1\\)的嵌入来获得的。 + diff --git a/docs/source/zh/bertology.md b/docs/source/zh/bertology.md new file mode 100644 index 00000000000000..9b39f948339474 --- /dev/null +++ b/docs/source/zh/bertology.md @@ -0,0 +1,33 @@ + + +# 基于BERT进行的相关研究(BERTology) + +当前,一个新兴的研究领域正致力于探索大规模 transformer 模型(如BERT)的内部工作机制,一些人称之为“BERTology”。以下是这个领域的一些典型示例: + + +- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: + https://arxiv.org/abs/1905.05950 +- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 +- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. + Manning: https://arxiv.org/abs/1906.04341 +- CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633 + + +为了助力这一新兴领域的发展,我们在BERT/GPT/GPT-2模型中增加了一些附加功能,方便人们访问其内部表示,这些功能主要借鉴了Paul Michel的杰出工作(https://arxiv.org/abs/1905.10650): + + +- 访问BERT/GPT/GPT-2的所有隐藏状态, +- 访问BERT/GPT/GPT-2每个注意力头的所有注意力权重, +- 检索注意力头的输出值和梯度,以便计算头的重要性得分并对头进行剪枝,详情可见论文:https://arxiv.org/abs/1905.10650。 + +为了帮助您理解和使用这些功能,我们添加了一个具体的示例脚本:[bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py),该脚本可以对一个在 GLUE 数据集上预训练的模型进行信息提取与剪枝。 \ No newline at end of file diff --git a/docs/source/zh/community.md b/docs/source/zh/community.md new file mode 100644 index 00000000000000..d9dd3938ed2877 --- /dev/null +++ b/docs/source/zh/community.md @@ -0,0 +1,69 @@ + + +# 社区 + +这个页面汇集了社区开发的🤗Transformers相关的资源。 + +## 社区资源 + +| 资源 | 描述 | 作者 | +|:----------|:-------------|------:| +| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | 这是一套基于 [Transformers文档术语表](glossary) 的抽认卡,它们已被整理成可以通过 [Anki](https://apps.ankiweb.net/) (一款专为长期知识保留而设计的开源、跨平台的应用)来进行学习和复习的形式。使用方法参见: [介绍如何使用抽认卡的视频](https://www.youtube.com/watch?v=Dji_h7PILrw)。 | [Darigov Research](https://www.darigovresearch.com/) | + +## 社区笔记本 + +| 笔记本 | 描述 | 作者 | | +|:----------|:-------------|:-------------|------:| +| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | 如何通过微调GPT-2模型来生成你最喜欢的艺术家风格的歌词 | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Train T5 in Tensorflow 2](https://github.com/snapthat/TF-T5-text-to-text) | 如何使用 Tensorflow 2 训练 T5 可以完成任何任务。本笔记本演示了如何使用 SQUAD 在 Tensorflow 2 中实现问答任务 | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | 如何使用 Transformers 和 Nlp 
在 SQUAD 上训练 T5 | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | 如何使用 PyTorch Lightning 的text-to-text格式对 T5 进行微调以完成分类和多项选择任务 | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | 如何在新数据集上微调 DialoGPT 模型,以实现开放式对话聊天机器人 | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | 如何使用 Reformer 对长达 500,000 个 token 的序列进行训练 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | 如何使用 blurr 对 BART 进行微调,以便使用 fastai 进行汇总 | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | +| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | 如何通过微调 GPT-2 模型生成以你最喜欢的 Twitter 帐户风格发布的推文 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | 展示 W&B 与 Hugging Face 集成的完整教程 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | 如何构建现有预训练模型的“长”版本 | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | 如何针对问答任务微调长模型 | [Suraj Patil](https://github.com/patil-suraj) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | 如何使用`nlp`库在TriviaQA数据集上评估Longformer模型| [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | 如何使用PyTorch Lightning以text-to-text的格式对T5进行微调,以进行情感跨度提取 | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | 如何使用 PyTorch 微调 DistilBert 进行多类分类 | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|如何使用 PyTorch 对 BERT 进行微调以进行多标签分类|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|如何在 PyTorch 中微调 T5 进行总结并使用 WandB 跟踪实验|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +|[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|如何通过使用动态填充/桶排序将微调速度提高两倍|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| 如何训练一个带有双向自注意力层的Reformer模型 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| 如何在 CORD 数据集上增加 AllenAI 预训练的 SciBERT 模型的词汇量,并对其进行流水线化 | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| 如何使用Trainer API在自定义数据集上对BlenderBotSmall进行微调以进行文本摘要 | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | 如何对Electra模型进行微调以进行情感分析,并使用Captum集成梯度来解释预测结果 | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | 如何使用 Trainer 类微调非英语 GPT-2 模型 | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | 如何针对多标签分类任务微调 DistilBERT 模型 | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | 如何针对句子对分类任务对 ALBERT 模型或其他基于 BERT 的模型进行微调 | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | 如何微调 Roberta 模型进行情绪分析 | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | 你的 seq2seq 转换器模型生成的问题的答案有多准确? 
| [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | 如何在 TensorFlow 中微调 DistilBERT 以进行文本分类 | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | 如何在CNN/Dailymail摘要任务上使用*google-bert/bert-base-uncased*检查点对*EncoderDecoderModel*进行热启动 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | 如何在BBC/XSum摘要任务上使用*FacebookAI/roberta-base*检查点对共享的*EncoderDecoderModel*进行热启动 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | 如何在Sequential Question Answering (SQA)数据集上使用*tapas-base*检查点对*TapasForQuestionAnswering*进行微调 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | 如何结合使用 🤗 数据集和 🤗 transformers 库,使用*tapas-base-finetuned-tabfact*检查点评估经过微调的*TapasForSequenceClassification* | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | 如何使用 Seq2SeqTrainer 对 mBART 进行微调以实现印地语到英语的翻译 | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Fine-tune LayoutLM on FUNSD (a form understanding dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | 如何在FUNSD数据集上对*LayoutLMForTokenClassification*进行微调以从扫描文档中提取信息 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | 如何微调 DistilGPT2 并生成文本 | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | 如何对LED模型在PubMed数据集上进行微调以进行长文本摘要 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | 如何有效评估LED模型的长远发展 | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | 如何在 RVL-CDIP 数据集上微调*LayoutLMForSequenceClassification*以进行扫描文档分类 | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | 如何通过语言模型调整解码 CTC 序列 | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | 如何使用Trainer类对BART模型进行多语言摘要任务的微调 | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | 评估BigBird模型在长文档问答任务上的性能,特别是在Trivia QA数据集上| [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | 如何使用Wav2Vec对任何视频的音频进行转录以创建YouTube字幕 | [Niklas 
Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | 如何使用HuggingFace的Transformers、Datasets和PyTorch Lightning在CIFAR-10数据集上对Vision Transformer(ViT)进行微调 | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | 如何使用HuggingFace的Transformers、Datasets和🤗 Trainer在CIFAR-10数据集上对Vision Transformer(ViT)进行微调| [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | 如何在开放实体数据集上评估*LukeForEntityClassification*| [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | 如何在 TACRED 数据集上评估*LukeForEntityPairClassification* | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | 如何在 CoNLL-2003 数据集上评估*LukeForEntitySpanClassification* | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | 如何在 PubMed 数据集上评估*BigBirdPegasusForConditionalGeneration*| [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) |如何利用预训练的 Wav2Vec2 模型在 MEGA 数据集上进行情绪分类| [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | 如何使用经过训练的*DetrForObjectDetection*模型检测图像中的物体并可视化注意力 | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | 如何在自定义对象检测数据集上微调*DetrForObjectDetection* | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | 如何在命名实体识别任务中微调*T5*| [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | +| [Fine-Tuning Open-Source LLM using QLoRA with MLflow and PEFT](https://github.com/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | 如何使用[QLoRA](https://github.com/artidoro/qlora) 和[PEFT](https://huggingface.co/docs/peft/en/index)以内存高效的方式微调大型语言模型(LLM),同时使用 [MLflow](https://mlflow.org/docs/latest/llms/transformers/index.html)进行实验跟踪| [Yuki Watanabe](https://github.com/B-Step62) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | diff --git a/docs/source/zh/installation.md b/docs/source/zh/installation.md index f87eaa5fc132cf..a9102f3393a740 100644 --- a/docs/source/zh/installation.md +++ b/docs/source/zh/installation.md @@ -157,7 +157,7 @@ conda install conda-forge::transformers 预训练模型会被下载并本地缓存到 `~/.cache/huggingface/hub`。这是由环境变量 `TRANSFORMERS_CACHE` 指定的默认目录。在 Windows 上,默认目录为 `C:\Users\username\.cache\huggingface\hub`。你可以按照不同优先级改变下述环境变量,以指定不同的缓存目录。 -1. 环境变量(默认): `HUGGINGFACE_HUB_CACHE` 或 `TRANSFORMERS_CACHE`。 +1. 环境变量(默认): `HF_HUB_CACHE` 或 `TRANSFORMERS_CACHE`。 2. 环境变量 `HF_HOME`。 3. 环境变量 `XDG_CACHE_HOME` + `/huggingface`。 diff --git a/docs/source/zh/perf_train_special.md b/docs/source/zh/perf_train_special.md new file mode 100644 index 00000000000000..ee8553475679ee --- /dev/null +++ b/docs/source/zh/perf_train_special.md @@ -0,0 +1,58 @@ + + +# 在 Apple Silicon 芯片上进行 PyTorch 训练 + +之前,在 Mac 上训练模型仅限于使用 CPU 训练。不过随着PyTorch v1.12的发布,您可以通过在 Apple Silicon 芯片的 GPU 上训练模型来显著提高性能和训练速度。这是通过将 Apple 的 Metal 性能着色器 (Metal Performance Shaders, MPS) 作为后端集成到PyTorch中实现的。[MPS后端](https://pytorch.org/docs/stable/notes/mps.html) 将 PyTorch 操作视为自定义的 Metal 着色器来实现,并将对应模块部署到`mps`设备上。 + + + +某些 PyTorch 操作目前还未在 MPS 上实现,可能会抛出错误提示。可以通过设置环境变量`PYTORCH_ENABLE_MPS_FALLBACK=1`来使用CPU内核以避免这种情况发生(您仍然会看到一个`UserWarning`)。 + +
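As a brief illustration of the fallback described above (an editorial sketch, not part of the original page), the environment variable has to be in place before the unsupported MPS operation is dispatched, so it is typically exported in the shell or set at the very top of the training script:

```py
import os

# Set the fallback before importing torch so any op without an MPS kernel
# runs on the CPU (emitting a UserWarning) instead of raising an error.
os.environ.setdefault("PYTORCH_ENABLE_MPS_FALLBACK", "1")

import torch

device = "mps" if torch.backends.mps.is_available() else "cpu"
x = torch.randn(4, 4, device=device)
print(x.mean())  # unsupported ops fall back to the CPU transparently
```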
+ + 如果您遇到任何其他错误,请在[PyTorch库](https://github.com/pytorch/pytorch/issues)中创建一个 issue,因为[`Trainer`]类中只集成了 MPS 后端。 +
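A minimal sketch (added here for illustration, not part of the original document) of how one might confirm that the installed PyTorch build can actually use the Apple GPU before configuring the `mps` device; `torch.backends.mps.is_built()` and `torch.backends.mps.is_available()` are the standard checks:

```py
import torch

if not torch.backends.mps.is_built():
    raise SystemExit("This PyTorch build was compiled without MPS support.")
if not torch.backends.mps.is_available():
    raise SystemExit("MPS is unavailable; macOS 12.3+ on Apple Silicon is required.")

device = torch.device("mps")
model = torch.nn.Linear(8, 2).to(device)       # toy module, purely illustrative
out = model(torch.randn(4, 8, device=device))  # forward pass runs on the Apple GPU
print(out.shape)                               # torch.Size([4, 2])
```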
+ + 配置好`mps`设备后,您可以: + +* 在本地训练更大的网络或更大的批量大小 +* 降低数据获取延迟,因为 GPU 的统一内存架构允许直接访问整个内存存储 +* 降低成本,因为您不需要再在云端 GPU 上训练或增加额外的本地 GPU + +在确保已安装PyTorch后就可以开始使用了。 MPS 加速支持macOS 12.3及以上版本。 + +```bash +pip install torch torchvision torchaudio +``` + +[`TrainingArguments`]类默认使用`mps`设备(如果可用),因此无需显式设置设备。例如,您可以直接运行[run_glue.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py)脚本,在无需进行任何修改的情况下自动启用 MPS 后端。 + +```diff +export TASK_NAME=mrpc + +python examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path google-bert/bert-base-cased \ + --task_name $TASK_NAME \ +- --use_mps_device \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir +``` + +用于[分布式设置](https://pytorch.org/docs/stable/distributed.html#backends)的后端(如`gloo`和`nccl`)不支持`mps`设备,这也意味着使用 MPS 后端时只能在单个 GPU 上进行训练。 + +您可以在[Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/)博客文章中了解有关 MPS 后端的更多信息。 diff --git a/docs/source/zh/tiktoken.md b/docs/source/zh/tiktoken.md new file mode 100644 index 00000000000000..c8ef6b129eccc0 --- /dev/null +++ b/docs/source/zh/tiktoken.md @@ -0,0 +1,55 @@ + + +# Transformers与Tiktoken的互操作性 + +在🤗 transformers中,当使用`from_pretrained`方法从Hub加载模型时,如果模型包含tiktoken格式的`tokenizer.model`文件,框架可以无缝支持tiktoken模型文件,并自动将其转换为我们的[快速词符化器](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast)。 + +### 已知包含`tiktoken.model`文件发布的模型: + - gpt2 + - llama3 + +## 使用示例 + +为了在transformers中正确加载`tiktoken`文件,请确保`tiktoken.model`文件是tiktoken格式的,它会在调用`from_pretrained`时自动加载。以下展示如何从同一个文件中加载词符化器(tokenizer)和模型: + +```py +from transformers import AutoTokenizer + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original") +``` +## 创建tiktoken词符化器(tokenizer) + +`tokenizer.model`文件中不包含任何额外的词符(token)或模式字符串(pattern strings)的信息。如果这些信息很重要,需要将词符化器(tokenizer)转换为适用于[`PreTrainedTokenizerFast`]类的`tokenizer.json`格式。 + +使用[tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63)获取词符编码(encoding),再使用[`convert_tiktoken_to_fast`]函数将其转换为`tokenizer.json`文件。 + +```py +from transformers.integrations.tiktoken import convert_tiktoken_to_fast +from tiktoken import get_encoding + +# You can load your custom encoding or the one provided by OpenAI +encoding = get_encoding("gpt2") +convert_tiktoken_to_fast(encoding, "config/save/dir") +``` + +生成的`tokenizer.json`文件将被保存到指定的目录,并且可以通过[`PreTrainedTokenizerFast`]类来加载。 + +```py +from transformers import PreTrainedTokenizerFast + +tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir") +``` diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 25a8706d869bcf..ee155e377e4137 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index c0085c9f4bb88c..095af99efffccc 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 9ffbb82cd3aae6..ddbde78f703ce2 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 9ffaade205611c..5f1988c36de17c 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py index aa0aac55ba9191..7042c586cbb636 100644 --- a/examples/modular-transformers/configuration_my_new_model.py +++ b/examples/modular-transformers/configuration_my_new_model.py @@ -130,6 +130,16 @@ class MyNewModelConfig(PretrainedConfig): model_type = "my_new_model" keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `MyNewModelModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } def __init__( self, diff --git a/examples/modular-transformers/configuration_my_new_model2.py b/examples/modular-transformers/configuration_my_new_model2.py index f05ace94b62241..eddd7fe47973ef 100644 --- a/examples/modular-transformers/configuration_my_new_model2.py +++ b/examples/modular-transformers/configuration_my_new_model2.py @@ -33,6 +33,16 @@ class MyNewModel2Config(PretrainedConfig): model_type = "my_new_model2" keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `MyNewModel2Model` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + 
"layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } def __init__( self, diff --git a/examples/modular-transformers/image_processing_new_imgproc_model.py b/examples/modular-transformers/image_processing_new_imgproc_model.py new file mode 100644 index 00000000000000..a64eb17861a1c2 --- /dev/null +++ b/examples/modular-transformers/image_processing_new_imgproc_model.py @@ -0,0 +1,287 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_new_imgproc_model.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_new_imgproc_model.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from typing import Dict, List, Optional, Union + +import numpy as np +import torch + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class ImgprocModelImageProcessor(BaseImageProcessor): + r""" + Constructs a IMGPROC_MODEL image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. 
+ image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. 
Got {size.keys()}") + output_size = (size["height"], size["width"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. 
+ input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + + return encoded_outputs + + def new_image_processing_method(self, pixel_values: torch.FloatTensor): + return pixel_values / 2 diff --git a/examples/modular-transformers/modeling_dummy.py b/examples/modular-transformers/modeling_dummy.py index ed7e3c64d7a895..0b373d4e6eab01 100644 --- a/examples/modular-transformers/modeling_dummy.py +++ b/examples/modular-transformers/modeling_dummy.py @@ -8,7 +8,6 @@ from typing import List, Optional, Tuple, Union import torch -import torch.nn.functional as F from torch import nn from ...activations import ACT2FN @@ -150,25 +149,7 @@ def __init__(self, config): self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) return down_proj @@ -264,31 +245,14 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 - ) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + 
value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) if position_embeddings is None: logger.warning_once( @@ -330,12 +294,7 @@ def forward( attn_output = attn_output.reshape(bsz, q_len, -1) - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) + attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None @@ -508,9 +467,10 @@ def forward( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) if position_embeddings is None: logger.warning_once( @@ -794,7 +754,10 @@ def __init__(self, config: DummyConfig): ) self.norm = DummyRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.rotary_emb = DummyRotaryEmbedding(config=config) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() @@ -874,7 +837,7 @@ def forward( all_self_attns = () if output_attentions else None next_decoder_cache = None - for decoder_layer in self.layers: + for decoder_layer in self.layers[: self.config.num_hidden_layers]: if output_hidden_states: all_hidden_states += (hidden_states,) diff --git a/examples/modular-transformers/modeling_from_uppercase_model.py b/examples/modular-transformers/modeling_from_uppercase_model.py new file mode 100644 index 00000000000000..d6c16c697437d8 --- /dev/null +++ b/examples/modular-transformers/modeling_from_uppercase_model.py @@ -0,0 +1,357 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_from_uppercase_model.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_from_uppercase_model.py file directly. One of our CI enforces this. 
+# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from typing import Optional, Tuple + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...pytorch_utils import is_torch_greater_or_equal_than_2_2 +from ...utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging +from .configuration_from_uppercase_model import FromUppercaseModelConfig + + +if is_flash_attn_2_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + + +logger = logging.get_logger(__name__) + + +class FromUppercaseModelAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = 
nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class FromUppercaseModelFlashAttention2(FromUppercaseModelAttention): + """ + FromUppercaseModelAttention flash attention module. This module inherits from `FromUppercaseModelAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + output_attentions = False + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim) + + dropout_rate = self.dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. 
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + is_causal=causal_attention_mask is not None, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + ) + + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + +class FromUppercaseModelSdpaAttention(FromUppercaseModelAttention): + """ + SDPA attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `FromUppercaseModelAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from FromUppercaseModelAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "FromUppercaseModelModel is using FromUppercaseModelSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not " + "support `output_attentions=True`. Falling back to the manual attention implementation, but specifying " + "the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can " + 'be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + + # FROM_UPPERCASE_MODEL text model uses both `causal_attention_mask` and `attention_mask` + if attention_mask is not None and causal_attention_mask is not None: + attn_mask = attention_mask + causal_attention_mask + elif causal_attention_mask is not None: + attn_mask = causal_attention_mask + else: + attn_mask = attention_mask + + bsz, tgt_len, embed_dim = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if not is_torch_greater_or_equal_than_2_2 and query_states.device.type == "cuda" and attn_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # FROM_UPPERCASE_MODEL text model uses both `causal_attention_mask` and `attention_mask` sequentially. + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attn_mask, + dropout_p=self.dropout if self.training else 0.0, + scale=self.scale, + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None + + +class FromUppercaseModelMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +FROM_UPPERCASE_MODEL_ATTENTION_CLASSES = { + "eager": FromUppercaseModelAttention, + "sdpa": FromUppercaseModelSdpaAttention, + "flash_attention_2": FromUppercaseModelFlashAttention2, +} + + +class FromUppercaseModelEncoderLayer(nn.Module): + def __init__(self, config: FromUppercaseModelConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = FROM_UPPERCASE_MODEL_ATTENTION_CLASSES[config._attn_implementation](config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = FromUppercaseModelMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative 
values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs diff --git a/examples/modular-transformers/modeling_multimodal1.py b/examples/modular-transformers/modeling_multimodal1.py new file mode 100644 index 00000000000000..02876996e0e5cd --- /dev/null +++ b/examples/modular-transformers/modeling_multimodal1.py @@ -0,0 +1,1017 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_multimodal1.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_multimodal1.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import math +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward +from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_greater_or_equal_2_10, + logging, +) +from .configuration_multimodal1 import Multimodal1TextConfig + + +logger = logging.get_logger(__name__) + + +class Multimodal1TextRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Multimodal1TextRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class Multimodal1TextRotaryEmbedding(nn.Module): + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[Multimodal1TextConfig] = None, + ): + super().__init__() + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`Multimodal1TextRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. 
All other arguments will be removed in v4.46" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len + + @torch.no_grad() + def forward(self, x, position_ids): + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + + # Advanced RoPE types (e.g. 
yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class Multimodal1TextMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Multimodal1TextAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Multimodal1TextConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. 
Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads) + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + + # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers) + self.rotary_emb = Multimodal1TextRotaryEmbedding(config=self.config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." 
+ ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Multimodal1TextFlashAttention2(Multimodal1TextAttention): + """ + Multimodal1Text flash attention module. This module inherits from `Multimodal1TextAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
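+        # The flag is simply forwarded as `use_top_left_mask` to `_flash_attention_forward` in the forward pass below.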
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if isinstance(past_key_value, StaticCache): + raise ValueError( + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" + ) + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. 
(Multimodal1TextRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Multimodal1TextSdpaAttention(Multimodal1TextAttention): + """ + Multimodal1Text attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Multimodal1TextAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Multimodal1TextAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Multimodal1TextModel is using Multimodal1TextSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
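+        # `is_causal=True` is only used when no explicit mask is provided and more than one query token is processed;
+        # for a single-token decoding step the causal constraint is a no-op, so it is disabled there as well.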
+ is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MULTIMODAL1_TEXT_ATTENTION_CLASSES = { + "eager": Multimodal1TextAttention, + "flash_attention_2": Multimodal1TextFlashAttention2, + "sdpa": Multimodal1TextSdpaAttention, +} + + +class Multimodal1TextDecoderLayer(nn.Module): + def __init__(self, config: Multimodal1TextConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = MULTIMODAL1_TEXT_ATTENTION_CLASSES[config._attn_implementation]( + config=config, layer_idx=layer_idx + ) + + self.mlp = Multimodal1TextMLP(config) + self.input_layernorm = Multimodal1TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Multimodal1TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
+ kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +MULTIMODAL1_TEXT_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Multimodal1TextConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Multimodal1Text Model outputting raw hidden-states without any specific head on top.", + MULTIMODAL1_TEXT_START_DOCSTRING, +) +class Multimodal1TextPreTrainedModel(PreTrainedModel): + config_class = Multimodal1TextConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Multimodal1TextDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + _supports_quantized_cache = True + _supports_static_cache = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MULTIMODAL1_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
+""" + + +@add_start_docstrings( + "The bare Multimodal1Text Model outputting raw hidden-states without any specific head on top.", + MULTIMODAL1_TEXT_START_DOCSTRING, +) +class Multimodal1TextModel(Multimodal1TextPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Multimodal1TextDecoderLayer`] + + Args: + config: Multimodal1TextConfig + """ + + def __init__(self, config: Multimodal1TextConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [Multimodal1TextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = Multimodal1TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = Multimodal1TextRotaryEmbedding(config=config) + + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MULTIMODAL1_TEXT_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + position_embeddings, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **flash_attn_kwargs, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
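+        # The logic below returns `None` whenever the 2D mask carries no information beyond causality (letting SDPA
+        # rely on its `is_causal` argument), and otherwise materializes a full 4D additive mask.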
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
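+            # An inverted mask holds 0.0 at positions that may be attended to and a large negative value at masked
+            # positions, so it can be added directly to the attention scores.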
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask diff --git a/examples/modular-transformers/modeling_multimodal2.py b/examples/modular-transformers/modeling_multimodal2.py new file mode 100644 index 00000000000000..b10b11b671af95 --- /dev/null +++ b/examples/modular-transformers/modeling_multimodal2.py @@ -0,0 +1,705 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from examples/modular-transformers/modular_multimodal2.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_multimodal2.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 + +from typing import Optional, Tuple, Union + +import torch +from torch import nn + +from transformers.utils import add_start_docstrings + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import is_torch_greater_or_equal_than_2_2 +from ...utils import ( + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, + torch_int, +) +from .configuration_multimodal2 import Multimodal2Config, Multimodal2VisionConfig + + +if is_flash_attn_2_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + + +logger = logging.get_logger(__name__) + + +class Multimodal2VisionAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +class Multimodal2VisionSdpaAttention(Multimodal2VisionAttention): + """ + SDPA attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Multimodal2VisionAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Multimodal2VisionAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Multimodal2VisionModel is using Multimodal2VisionSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not " + "support `output_attentions=True`. Falling back to the manual attention implementation, but specifying " + "the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can " + 'be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + + # MULTIMODAL2_VISION text model uses both `causal_attention_mask` and `attention_mask` + if attention_mask is not None and causal_attention_mask is not None: + attn_mask = attention_mask + causal_attention_mask + elif causal_attention_mask is not None: + attn_mask = causal_attention_mask + else: + attn_mask = attention_mask + + bsz, tgt_len, embed_dim = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
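+        # Hence the `.contiguous()` workaround below is only applied on torch < 2.2, where the issue is present.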
+ if not is_torch_greater_or_equal_than_2_2 and query_states.device.type == "cuda" and attn_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # MULTIMODAL2_VISION text model uses both `causal_attention_mask` and `attention_mask` sequentially. + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attn_mask, + dropout_p=self.dropout if self.training else 0.0, + scale=self.scale, + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None + + +class Multimodal2VisionFlashAttention2(Multimodal2VisionAttention): + """ + Multimodal2VisionAttention flash attention module. This module inherits from `Multimodal2VisionAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + output_attentions = False + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim) + + dropout_rate = self.dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. 
+ + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + is_causal=causal_attention_mask is not None, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + ) + + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + +class Multimodal2VisionMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +MULTIMODAL2_VISION_ATTENTION_CLASSES = { + "eager": Multimodal2VisionAttention, + "sdpa": Multimodal2VisionSdpaAttention, + "flash_attention_2": Multimodal2VisionFlashAttention2, +} + + +class Multimodal2VisionEncoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = MULTIMODAL2_VISION_ATTENTION_CLASSES[config._attn_implementation](config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = Multimodal2VisionMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Multimodal2VisionEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`Multimodal2VisionEncoderLayer`]. + + Args: + config: Multimodal2VisionConfig + """ + + def __init__(self, config): + super().__init__() + self.config = config + self.layers = nn.ModuleList([Multimodal2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class Multimodal2VisionEmbeddings(nn.Module): + def __init__(self, config: Multimodal2VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. 
+ + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding(self.position_ids) + + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" f" ({self.image_size}*{self.image_size})." + ) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +MULTIMODAL2_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`Multimodal2ImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +class Multimodal2VisionTransformer(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = Multimodal2VisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = Multimodal2VisionEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + @add_start_docstrings_to_model_forward(MULTIMODAL2_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Multimodal2VisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class Multimodal2VisionPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = Multimodal2Config + base_model_prefix = "multimodal2_vision" + supports_gradient_checkpointing = True + _supports_sdpa = True + _supports_flash_attn_2 = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, Multimodal2VisionMLP): + pass + + +MULTIMODAL2_VISION_START_DOCSTRING = "doc" + + +@add_start_docstrings("New doc", MULTIMODAL2_VISION_START_DOCSTRING) +class Multimodal2VisionModel(Multimodal2VisionPreTrainedModel): + config_class = Multimodal2VisionConfig + main_input_name = "pixel_values" + _no_split_modules = ["Multimodal2VisionEncoderLayer"] + + def __init__(self, config: Multimodal2VisionConfig): + super().__init__(config) + self.vision_model = Multimodal2VisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(MULTIMODAL2_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Multimodal2VisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Multimodal2VisionModel + + >>> model = Multimodal2VisionModel.from_pretrained("openai/multimodal2-vit-base-patch32") + >>> processor = AutoProcessor.from_pretrained("openai/multimodal2-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) diff --git a/examples/modular-transformers/modeling_my_new_model2.py b/examples/modular-transformers/modeling_my_new_model2.py index 16f9e525a05e90..189e090094c76c 100644 --- a/examples/modular-transformers/modeling_my_new_model2.py +++ b/examples/modular-transformers/modeling_my_new_model2.py @@ -667,7 +667,10 @@ def __init__(self, config: MyNewModel2Config): [MyNewModel2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = MyNewModel2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() @@ -752,7 +755,7 @@ def forward( all_self_attns = () if output_attentions else None next_decoder_cache = None - for decoder_layer in self.layers: + for decoder_layer in self.layers[: self.config.num_hidden_layers]: if output_hidden_states: all_hidden_states += (hidden_states,) diff --git 
a/examples/modular-transformers/modeling_new_task_model.py b/examples/modular-transformers/modeling_new_task_model.py index 4556308f1ea077..d303d328e887d6 100644 --- a/examples/modular-transformers/modeling_new_task_model.py +++ b/examples/modular-transformers/modeling_new_task_model.py @@ -265,7 +265,7 @@ def _update_causal_mask( min_dtype = torch.finfo(dtype).min sequence_length = inputs_embeds.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] @@ -358,9 +358,9 @@ def forward( ```python >>> from PIL import Image >>> import requests - >>> from transformers import AutoProcessor, NewTaskModelForConditionalGeneration + >>> from transformers import AutoProcessor, NewTaskModelForNewTask - >>> model = NewTaskModelForConditionalGeneration.from_pretrained("google/NewTaskModel-test-224px-hf") + >>> model = NewTaskModelForNewTask.from_pretrained("google/NewTaskModel-test-224px-hf") >>> processor = AutoProcessor.from_pretrained("google/NewTaskModel-test-224px-hf") >>> prompt = "answer en Where is the cow standing?" diff --git a/examples/modular-transformers/modeling_super.py b/examples/modular-transformers/modeling_super.py index 7df04bcc2a99e5..7ad606280dcc96 100644 --- a/examples/modular-transformers/modeling_super.py +++ b/examples/modular-transformers/modeling_super.py @@ -8,7 +8,6 @@ from typing import List, Optional, Tuple, Union import torch -import torch.nn.functional as F from torch import nn from ...activations import ACT2FN @@ -150,25 +149,7 @@ def __init__(self, config): self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) return down_proj @@ -264,31 +245,14 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 - ) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in 
range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) if position_embeddings is None: logger.warning_once( @@ -330,12 +294,7 @@ def forward( attn_output = attn_output.reshape(bsz, q_len, -1) - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) + attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None @@ -508,9 +467,10 @@ def forward( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) if position_embeddings is None: logger.warning_once( @@ -794,7 +754,10 @@ def __init__(self, config: SuperConfig): ) self.norm = SuperRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.rotary_emb = SuperRotaryEmbedding(config=config) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() diff --git a/examples/modular-transformers/modular_from_uppercase_model.py b/examples/modular-transformers/modular_from_uppercase_model.py new file mode 100644 index 00000000000000..ef3044e7ee2cf0 --- /dev/null +++ b/examples/modular-transformers/modular_from_uppercase_model.py @@ -0,0 +1,6 @@ +from transformers.models.clip.modeling_clip import CLIPEncoderLayer + + +# Check if we can correctly grab dependencies with correct naming from all UPPERCASE old model +class FromUppercaseModelEncoderLayer(CLIPEncoderLayer): + pass diff --git a/examples/modular-transformers/modular_multimodal1.py b/examples/modular-transformers/modular_multimodal1.py new file mode 100644 index 00000000000000..8f8eaf91a37106 --- 
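The refactor above leans on a detail worth spelling out: as the diff's own comment notes, under tensor parallelism each rank's `q_proj`/`k_proj`/`v_proj` only produces its local share of the heads, so hard-coding `self.num_heads` in the `.view()` call would no longer match the tensor width. Passing `-1` lets PyTorch infer the per-rank head count instead. The following is only a toy sketch of that behaviour; the batch size, sequence length, head count and TP degree are made-up values, not anything taken from the diff.

```python
import torch

# Toy shapes, chosen only for illustration.
bsz, q_len, head_dim, num_heads, tp_degree = 2, 5, 64, 32, 4

# Without tensor parallelism the projection output holds all 32 heads...
full_proj = torch.randn(bsz, q_len, num_heads * head_dim)
# ...while a TP rank only holds its shard (here 32 / 4 = 8 heads).
local_proj = torch.randn(bsz, q_len, (num_heads // tp_degree) * head_dim)

# The same reshape works for both, because -1 infers the head count from the tensor itself.
print(full_proj.view(bsz, q_len, -1, head_dim).transpose(1, 2).shape)   # torch.Size([2, 32, 5, 64])
print(local_proj.view(bsz, q_len, -1, head_dim).transpose(1, 2).shape)  # torch.Size([2, 8, 5, 64])
```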
/dev/null +++ b/examples/modular-transformers/modular_multimodal1.py @@ -0,0 +1,6 @@ +from transformers.models.llama.modeling_llama import LlamaModel + + +# Check that we can correctly change the prefix (here add Text part at the end of the name) +class Multimodal1TextModel(LlamaModel): + pass diff --git a/examples/modular-transformers/modular_multimodal2.py b/examples/modular-transformers/modular_multimodal2.py new file mode 100644 index 00000000000000..bc11e0b2869fb9 --- /dev/null +++ b/examples/modular-transformers/modular_multimodal2.py @@ -0,0 +1,88 @@ +""" +Here, because clip is not consistent with the use of the "Text" and "Vision" prefixes, we cannot simply use +``` +class Multimodal2VisionModel(CLIPVisionModel): + pass +``` +with the hope that all dependencies will be renamed as `Multimodal2VisionClass`. For this reason, if we want consistency and +use the "Vision" part everywhere, we need to overwrite the intermediate classes and add the prefix everytime. +This adds noise to the modular, but is unfortunately unavoidable. +""" + +from torch import nn + +from transformers.models.clip.modeling_clip import ( + CLIPMLP, + CLIPAttention, + CLIPEncoder, + CLIPEncoderLayer, + CLIPFlashAttention2, + CLIPPreTrainedModel, + CLIPSdpaAttention, + CLIPVisionModel, + CLIPVisionTransformer, +) +from transformers.utils import add_start_docstrings + + +class Multimodal2VisionAttention(CLIPAttention): + pass + + +# Check that adding the second base class correctly set the parent, even though in clip it does not have the "Vision" part +class Multimodal2VisionSdpaAttention(CLIPSdpaAttention, Multimodal2VisionAttention): + pass + + +# Check that adding the second base class correctly set the parent, even though in clip it does not have the "Vision" part +class Multimodal2VisionFlashAttention2(CLIPFlashAttention2, Multimodal2VisionAttention): + pass + + +MULTIMODAL2_VISION_ATTENTION_CLASSES = { + "eager": Multimodal2VisionAttention, + "sdpa": Multimodal2VisionSdpaAttention, + "flash_attention_2": Multimodal2VisionFlashAttention2, +} + + +class Multimodal2VisionMLP(CLIPMLP): + pass + + +class Multimodal2VisionEncoderLayer(CLIPEncoderLayer): + def __init__(self, config): + super().__init__() + self.self_attn = MULTIMODAL2_VISION_ATTENTION_CLASSES[config._attn_implementation](config) + self.mlp = Multimodal2VisionMLP(config) + + +class Multimodal2VisionEncoder(CLIPEncoder): + def __init__(self, config): + super().__init__(config) + self.layers = nn.ModuleList([Multimodal2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + + +# Finally here the `Vision` part was correct in CLIP, but we still need to tell it that the encoder arg should use it as well +class Multimodal2VisionTransformer(CLIPVisionTransformer): + def __init__(self, config): + super().__init__(config) + self.encoder = Multimodal2VisionEncoder(config) + + +class Multimodal2VisionPreTrainedModel(CLIPPreTrainedModel): + def _init_weights(self, module): + if isinstance(module, Multimodal2VisionMLP): + pass + + +MULTIMODAL2_VISION_START_DOCSTRING = "doc" + + +# Here the only arg `self.vision_model = CLIPVisionTransformer(config)` in CLIPVisionModel already has the "Vision" part, so +# no need to overwrite it, it will look for `Multimodal2VisionTransformer` which has already being redefined above +# Note: we may want to redefine decorator as well for full consistency, as CLIP does not use "CLIP_VISION_START_DOCSTRING" but only +# "CLIP_START_DOCSTRING" +@add_start_docstrings("New doc", MULTIMODAL2_VISION_START_DOCSTRING) +class 
Multimodal2VisionModel(CLIPVisionModel, Multimodal2VisionPreTrainedModel): + _no_split_modules = ["Multimodal2VisionEncoderLayer"] diff --git a/examples/modular-transformers/modular_new_imgproc_model.py b/examples/modular-transformers/modular_new_imgproc_model.py new file mode 100644 index 00000000000000..1d054166c28d3e --- /dev/null +++ b/examples/modular-transformers/modular_new_imgproc_model.py @@ -0,0 +1,9 @@ +import torch +import torch.utils.checkpoint + +from transformers.models.blip.image_processing_blip import BlipImageProcessor + + +class ImgprocModelImageProcessor(BlipImageProcessor): + def new_image_processing_method(self, pixel_values: torch.FloatTensor): + return pixel_values / 2 diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index cfbc4d83d93c49..ef308316569b79 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 8353333ef827ed..d42fb52d5c13c0 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index aa1cd089ef5c7f..41b843aade9c66 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 2c60b359bd106b..6cbcac0a7e688a 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) @@ -331,7 +331,7 @@ def main(): config = AutoConfig.from_pretrained( args.model_name_or_path, num_labels=len(labels), - i2label=id2label, + id2label=id2label, label2id=label2id, finetuning_task="image-classification", trust_remote_code=args.trust_remote_code, diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 90b30c60e78411..f23e55191709a5 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 773038f445cc40..9d052076b7b162 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 5f38481db2315c..100a1365c2e9ea 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/README.md b/examples/pytorch/instance-segmentation/README.md index 72eb5a5befb4fb..339d7591523de7 100644 --- a/examples/pytorch/instance-segmentation/README.md +++ b/examples/pytorch/instance-segmentation/README.md @@ -148,7 +148,7 @@ with torch.no_grad(): outputs = model(**inputs) # Post-process outputs -outputs = image_processor.post_process_instance_segmentation(outputs, target_sizes=[image.size[::-1]]) +outputs = image_processor.post_process_instance_segmentation(outputs, target_sizes=[(image.height, image.width)]) print("Mask shape: ", outputs[0]["segmentation"].shape) print("Mask values: ", outputs[0]["segmentation"].unique()) diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 368296709f699a..806330fb72d107 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
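The instance-segmentation README tweak to `post_process_instance_segmentation` above is purely about readability: PIL reports `Image.size` as `(width, height)`, while the post-processing call expects one `(height, width)` tuple per image, so reversing the tuple and reading the attributes directly yield the same value. A minimal sketch, assuming only that a PIL image is at hand (the 640x480 image below is invented for the example):

```python
from PIL import Image

# A throwaway 640x480 image; PIL sizes are reported as (width, height).
image = Image.new("RGB", (640, 480))

assert image.size == (640, 480)                                        # (width, height)
assert image.size[::-1] == (image.height, image.width) == (480, 640)   # (height, width)

# Both spellings pass the post-processing call the same (height, width) pair;
# the attribute form simply makes the intended order explicit.
target_sizes = [(image.height, image.width)]
```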
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index d8bb9d6f235e61..d888b7853dd475 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index d3f8ad8da97f3c..10bfee8f25f775 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 15538b2ef2e302..078b0add065ceb 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index 9d0e0008839d99..cac845f3a055df 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 0af6d61107db66..0a0e10511fa236 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 4b615fdc4cf1d5..8cb30099491a3a 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 13a1f7a0d86231..0bff38707d567a 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 1c2b7ecf9905ef..20763558a5f626 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index ea6c4a0e317eec..f188e4e476a208 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 2f6723907957f5..2d4e8bdbb92c06 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 9111874438648a..07fcb36acb1583 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index f312d0ce8a1f1a..33ad0499301e18 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 3159a79c7e5525..6de464f4367072 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 2fc71e0666be23..c3e12ac9edef16 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 3b7d607933c38a..8e791564b007bf 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index a8f1fc10b9c540..6ccce481b548a9 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index b0bcb940e5186f..c1874b3fe18e12 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 46f2fa45a246cc..6f77f82564172d 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index a0ce4d0f75c639..16e64eb92343f1 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 78c798fd471309..9eb3498c8c174e 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 4d9bb7780420d8..682d3b16d216a3 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index aa03dacd981d6e..e6a643e4213910 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 9c4c2ac13d44c2..a2d09f20004704 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 3d38e35aac59e2..ab5ab7adb19c44 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index e7a186836fb82c..dae845b119b172 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 90acf81a36a408..2a99bc42e1195a 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 7fcdf81fa861ed..8da7e86d87551f 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index b058b6f74fdc5c..c76f83ce4def6f 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index c8cb098e344b0a..056db7167280ed 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 0646af80bdc71d..56e3a1e646db34 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index ea37b9c51e6769..dadfcb80941e27 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index ba1f15dd83edd4..df4c1e9557a982 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/research_projects/decision_transformer/requirements.txt b/examples/research_projects/decision_transformer/requirements.txt index 6d42c3256a83e9..33bb1d658595e5 100644 --- a/examples/research_projects/decision_transformer/requirements.txt +++ b/examples/research_projects/decision_transformer/requirements.txt @@ -1,5 +1,5 @@ absl-py==1.0.0 -aiohttp==3.10.2 +aiohttp==3.10.11 aiosignal==1.2.0 alembic==1.7.7 appdirs==1.4.4 diff --git a/examples/research_projects/lxmert/requirements.txt b/examples/research_projects/lxmert/requirements.txt index ed9ecaa7bf9915..e2778663a53c72 100644 --- a/examples/research_projects/lxmert/requirements.txt +++ b/examples/research_projects/lxmert/requirements.txt @@ -86,7 +86,7 @@ testpath==0.4.4 tokenizers==0.8.1rc2 torch==2.2.0 torchvision==0.7.0 -tornado==6.4.1 +tornado==6.4.2 tqdm==4.66.3 traitlets git+https://github.com/huggingface/transformers.git diff --git a/examples/research_projects/visual_bert/requirements.txt b/examples/research_projects/visual_bert/requirements.txt index ed9ecaa7bf9915..e2778663a53c72 100644 --- a/examples/research_projects/visual_bert/requirements.txt +++ b/examples/research_projects/visual_bert/requirements.txt @@ -86,7 +86,7 @@ testpath==0.4.4 tokenizers==0.8.1rc2 torch==2.2.0 torchvision==0.7.0 -tornado==6.4.1 +tornado==6.4.2 tqdm==4.66.3 traitlets git+https://github.com/huggingface/transformers.git diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 20a01a46f21a59..01c31de8730b12 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index 78655e7d6bc3d5..296c70549bda1b 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index cbd4400580d9d4..b35d761d8a6a6a 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index f9c6de0e42bec2..a78a5d89e19f6c 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 92ebd0e1d77533..92a10990d160ed 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index a51939d8d58801..a8f2de825cc2e4 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 50189345d56890..1afb72cf10984e 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/i18n/README_ar.md b/i18n/README_ar.md index c2dd588fdb233f..8160ec908d4411 100644 --- a/i18n/README_ar.md +++ b/i18n/README_ar.md @@ -245,7 +245,7 @@ limitations under the License. ### باستخدام pip -تم اختبار هذا المستودع على Python 3.8+، Flax 0.4.1+، PyTorch 1.11+، و TensorFlow 2.6+. +تم اختبار هذا المستودع على Python 3.9+، Flax 0.4.1+، PyTorch 1.11+، و TensorFlow 2.6+. يجب تثبيت 🤗 Transformers في [بيئة افتراضية](https://docs.python.org/3/library/venv.html). إذا كنت غير معتاد على البيئات الافتراضية Python، فراجع [دليل المستخدم](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_de.md b/i18n/README_de.md index 2532c9e12fab59..ccc9e6111a25f0 100644 --- a/i18n/README_de.md +++ b/i18n/README_de.md @@ -246,7 +246,7 @@ Das Modell selbst ist ein reguläres [PyTorch `nn.Module`](https://pytorch.org/d ### Mit pip -Dieses Repository wurde mit Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ und TensorFlow 2.6+ getestet. +Dieses Repository wurde mit Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ und TensorFlow 2.6+ getestet. Sie sollten 🤗 Transformers in einer [virtuellen Umgebung](https://docs.python.org/3/library/venv.html) installieren. 
Wenn Sie mit virtuellen Python-Umgebungen nicht vertraut sind, schauen Sie sich den [Benutzerleitfaden](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) an. diff --git a/i18n/README_es.md b/i18n/README_es.md index 6682147d7867cf..5d5ba1b3249785 100644 --- a/i18n/README_es.md +++ b/i18n/README_es.md @@ -222,7 +222,7 @@ El modelo en si es un [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.h ### Con pip -Este repositorio está probado en Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ y TensorFlow 2.6+. +Este repositorio está probado en Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ y TensorFlow 2.6+. Deberías instalar 🤗 Transformers en un [entorno virtual](https://docs.python.org/3/library/venv.html). Si no estas familiarizado con los entornos virtuales de Python, consulta la [guía de usuario](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_fr.md b/i18n/README_fr.md index c1eaa10edb927d..97b11166b301a1 100644 --- a/i18n/README_fr.md +++ b/i18n/README_fr.md @@ -243,7 +243,7 @@ Le modèle lui-même est un module [`nn.Module` PyTorch](https://pytorch.org/doc ### Avec pip -Ce référentiel est testé sur Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ et TensorFlow 2.6+. +Ce référentiel est testé sur Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ et TensorFlow 2.6+. Vous devriez installer 🤗 Transformers dans un [environnement virtuel](https://docs.python.org/3/library/venv.html). Si vous n'êtes pas familier avec les environnements virtuels Python, consultez le [guide utilisateur](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_hd.md b/i18n/README_hd.md index 07077e5dd9c37d..17efdd21eb04dc 100644 --- a/i18n/README_hd.md +++ b/i18n/README_hd.md @@ -198,7 +198,7 @@ checkpoint: जाँच बिंदु ### पिप का उपयोग करना -इस रिपॉजिटरी का परीक्षण Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ और TensorFlow 2.6+ के तहत किया गया है। +इस रिपॉजिटरी का परीक्षण Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ और TensorFlow 2.6+ के तहत किया गया है। आप [वर्चुअल एनवायरनमेंट](https://docs.python.org/3/library/venv.html) में 🤗 ट्रांसफॉर्मर इंस्टॉल कर सकते हैं। यदि आप अभी तक पायथन के वर्चुअल एनवायरनमेंट से परिचित नहीं हैं, तो कृपया इसे [उपयोगकर्ता निर्देश](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) पढ़ें। diff --git a/i18n/README_ja.md b/i18n/README_ja.md index 293a5ee111b0c7..3d417098ea314d 100644 --- a/i18n/README_ja.md +++ b/i18n/README_ja.md @@ -256,7 +256,7 @@ Hugging Faceチームによって作られた **[トランスフォーマーを ### pipにて -このリポジトリは、Python 3.8+, Flax 0.4.1+, PyTorch 1.11+, TensorFlow 2.6+ でテストされています。 +このリポジトリは、Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, TensorFlow 2.6+ でテストされています。 🤗Transformersは[仮想環境](https://docs.python.org/3/library/venv.html)にインストールする必要があります。Pythonの仮想環境に慣れていない場合は、[ユーザーガイド](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)を確認してください。 diff --git a/i18n/README_pt-br.md b/i18n/README_pt-br.md index 79007e5aaa33f9..d9248f9a151c36 100644 --- a/i18n/README_pt-br.md +++ b/i18n/README_pt-br.md @@ -253,7 +253,7 @@ O modelo em si é um [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.ht ### Com pip -Este repositório é testado no Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ e TensorFlow 2.6+. +Este repositório é testado no Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ e TensorFlow 2.6+. Você deve instalar o 🤗 Transformers em um [ambiente virtual](https://docs.python.org/3/library/venv.html). 
Se você não está familiarizado com ambientes virtuais em Python, confira o [guia do usuário](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_ru.md b/i18n/README_ru.md index ebab1236ca8542..a359b52d2ccc73 100644 --- a/i18n/README_ru.md +++ b/i18n/README_ru.md @@ -244,7 +244,7 @@ Hugging Face Hub. Мы хотим, чтобы Transformers позволил ра ### С помощью pip -Данный репозиторий протестирован на Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ и TensorFlow 2.6+. +Данный репозиторий протестирован на Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ и TensorFlow 2.6+. Устанавливать 🤗 Transformers следует в [виртуальной среде](https://docs.python.org/3/library/venv.html). Если вы не знакомы с виртуальными средами Python, ознакомьтесь с [руководством пользователя](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_te.md b/i18n/README_te.md index feb537ad1a48d2..a9795e9ca326aa 100644 --- a/i18n/README_te.md +++ b/i18n/README_te.md @@ -246,7 +246,7 @@ limitations under the License. ### పిప్ తో -ఈ రిపోజిటరీ పైథాన్ 3.8+, ఫ్లాక్స్ 0.4.1+, PyTorch 1.11+ మరియు TensorFlow 2.6+లో పరీక్షించబడింది. +ఈ రిపోజిటరీ పైథాన్ 3.9+, ఫ్లాక్స్ 0.4.1+, PyTorch 1.11+ మరియు TensorFlow 2.6+లో పరీక్షించబడింది. మీరు [వర్చువల్ వాతావరణం](https://docs.python.org/3/library/venv.html)లో 🤗 ట్రాన్స్‌ఫార్మర్‌లను ఇన్‌స్టాల్ చేయాలి. మీకు పైథాన్ వర్చువల్ పరిసరాల గురించి తెలియకుంటే, [యూజర్ గైడ్](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) చూడండి. diff --git a/i18n/README_ur.md b/i18n/README_ur.md index e14c8707770791..cc37b5cfc4223d 100644 --- a/i18n/README_ur.md +++ b/i18n/README_ur.md @@ -91,7 +91,7 @@ limitations under the License. - [‏BART کے ساتھ خلاصہ کاری](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) - [‏DistilBERT کے ساتھ سوالات کے 
جوابات](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) - [‏T5 کے ساتھ ترجمہ](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) - + کمپیوٹر وژن میں: - [‏ViT کے ساتھ امیج کی درجہ بندی](https://huggingface.co/google/vit-base-patch16-224) - [‏DETR کے ساتھ اشیاء کی شناخت](https://huggingface.co/facebook/detr-resnet-50) @@ -224,7 +224,7 @@ limitations under the License. ## مجھے Transformers کیوں استعمال کرنا چاہیے؟ ‏ 1. استعمال میں آسان جدید ترین ماڈلز: - + - قدرتی زبان کی سمجھ اور تخلیق، کمپیوٹر وژن، اور آڈیو کے کاموں میں اعلی کارکردگی۔ - معلمین اور عملی ماہرین کے لیے کم داخلی رکاوٹ۔ - سیکھنے کے لیے صرف تین کلاسز کے ساتھ چند یوزر فرینڈلی ایبسٹریکشنز۔ @@ -236,7 +236,7 @@ limitations under the License. - عملی ماہرین کمپیوٹ وقت اور پروڈکشن اخراجات کو کم کر سکتے ہیں۔ - ہر موڈیلٹی کے لیے 400,000 سے زیادہ pretrained ماڈلز کے ساتھ درجنوں آرکیٹیکچرز۔ -‏ 3. ماڈل کے لائف ٹائم کے ہر حصے کے لیے صحیح +‏ 3. ماڈل کے لائف ٹائم کے ہر حصے کے لیے صحیح فریم ورک کا انتخاب کریں: - 3 لائنز کے کوڈ میں جدید ترین ماڈلز تربیت دیں۔ @@ -259,7 +259,7 @@ limitations under the License. 
#### ‏ pip کے ساتھ -یہ ریپوزٹری Python 3.8+، Flax 0.4.1+، PyTorch 1.11+، اور TensorFlow 2.6+ پر ٹیسٹ کی گئی ہے۔ +یہ ریپوزٹری Python 3.9+، Flax 0.4.1+، PyTorch 1.11+، اور TensorFlow 2.6+ پر ٹیسٹ کی گئی ہے۔ آپ کو 🤗 Transformers کو ایک [ورچوئل ماحول](https://docs.python.org/3/library/venv.html) میں انسٹال کرنا چاہیے۔ اگر آپ Python ورچوئل ماحول سے واقف نہیں ہیں، تو [یوزر گائیڈ](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) دیکھیں۔ @@ -290,7 +290,7 @@ Flax، PyTorch، یا TensorFlow کو conda کے ساتھ انسٹال کرنے > **_نوٹ:_** ونڈوز پر، آپ کو کیشنگ سے فائدہ اٹھانے کے لیے ڈویلپر موڈ کو ایکٹیویٹ کرنے کا پیغام دیا جا سکتا ہے۔ اگر یہ آپ کے لیے ممکن نہیں ہے، تو براہ کرم ہمیں [اس مسئلے](https://github.com/huggingface/huggingface_hub/issues/1062) میں بتائیں۔ -### ماڈل کی تعمیرات +### ماڈل کی تعمیرات ‏ 🤗 Transformers کی طرف سے فراہم کردہ **[تمام ماڈل چیک پوائنٹس](https://huggingface.co/models)** ہگنگ فیس کے ماڈل حب [model hub](https://huggingface.co/models) سے بآسانی مربوط ہیں، جہاں یہ براہ راست [صارفین](https://huggingface.co/users) اور [تنظیموں](https://huggingface.co/organizations) کے ذریعہ اپ لوڈ کیے جاتے ہیں۔ diff --git a/i18n/README_vi.md b/i18n/README_vi.md index 5e5c2ab1e25cf7..f523c282b680c4 100644 --- a/i18n/README_vi.md +++ b/i18n/README_vi.md @@ -245,7 +245,7 @@ Chính mô hình là một [Pytorch `nn.Module`](https://pytorch.org/docs/stable ### Sử dụng pip -Thư viện này được kiểm tra trên Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ và TensorFlow 2.6+. +Thư viện này được kiểm tra trên Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ và TensorFlow 2.6+. Bạn nên cài đặt 🤗 Transformers trong một [môi trường ảo Python](https://docs.python.org/3/library/venv.html). Nếu bạn chưa quen với môi trường ảo Python, hãy xem [hướng dẫn sử dụng](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). 
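The translated READMEs above (and the ones that follow) all raise the documented minimum from Python 3.8 to Python 3.9. As a quick illustration of that floor (a sketch, not part of this diff), an environment can be checked before installing:

```python
import sys

# transformers now documents Python 3.9+ as the tested minimum.
assert sys.version_info >= (3, 9), f"Python 3.9+ required, found {sys.version.split()[0]}"
```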
diff --git a/i18n/README_zh-hans.md b/i18n/README_zh-hans.md index 61f3a19849ff55..c9ac0357f18f1b 100644 --- a/i18n/README_zh-hans.md +++ b/i18n/README_zh-hans.md @@ -198,7 +198,7 @@ checkpoint: 检查点 ### 使用 pip -这个仓库已在 Python 3.8+、Flax 0.4.1+、PyTorch 1.11+ 和 TensorFlow 2.6+ 下经过测试。 +这个仓库已在 Python 3.9+、Flax 0.4.1+、PyTorch 1.11+ 和 TensorFlow 2.6+ 下经过测试。 你可以在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装 🤗 Transformers。如果你还不熟悉 Python 的虚拟环境,请阅此[用户说明](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。 diff --git a/i18n/README_zh-hant.md b/i18n/README_zh-hant.md index e20798a2d4571f..87c623ee84a61b 100644 --- a/i18n/README_zh-hant.md +++ b/i18n/README_zh-hant.md @@ -210,7 +210,7 @@ Tokenizer 為所有的預訓練模型提供了預處理,並可以直接轉換 ### 使用 pip -這個 Repository 已在 Python 3.8+、Flax 0.4.1+、PyTorch 1.11+ 和 TensorFlow 2.6+ 下經過測試。 +這個 Repository 已在 Python 3.9+、Flax 0.4.1+、PyTorch 1.11+ 和 TensorFlow 2.6+ 下經過測試。 你可以在[虛擬環境](https://docs.python.org/3/library/venv.html)中安裝 🤗 Transformers。如果你還不熟悉 Python 的虛擬環境,請閱此[使用者指引](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。 diff --git a/setup.py b/setup.py index 922258d65efab7..a9babfaeea67ab 100644 --- a/setup.py +++ b/setup.py @@ -179,8 +179,8 @@ "tf2onnx", "timeout-decorator", "tiktoken", - "timm<=0.9.16", - "tokenizers>=0.20,<0.21", + "timm<=1.0.11", + "tokenizers>=0.21,<0.22", "torch", "torchaudio", "torchvision", @@ -435,7 +435,7 @@ def run(self): setup( name="transformers", - version="4.47.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.48.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 36cc4449aec4a2..2eaec8f1def96e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). 
-__version__ = "4.47.0.dev0" +__version__ = "4.48.0.dev0" from typing import TYPE_CHECKING @@ -122,6 +122,7 @@ "feature_extraction_utils": ["BatchFeature", "FeatureExtractionMixin"], "file_utils": [], "generation": [ + "CompileConfig", "GenerationConfig", "TextIteratorStreamer", "TextStreamer", @@ -169,6 +170,11 @@ "AltCLIPTextConfig", "AltCLIPVisionConfig", ], + "models.aria": [ + "AriaConfig", + "AriaProcessor", + "AriaTextConfig", + ], "models.audio_spectrogram_transformer": [ "ASTConfig", "ASTFeatureExtractor", @@ -484,6 +490,7 @@ "models.idefics": ["IdeficsConfig"], "models.idefics2": ["Idefics2Config"], "models.idefics3": ["Idefics3Config"], + "models.ijepa": ["IJepaConfig"], "models.imagegpt": ["ImageGPTConfig"], "models.informer": ["InformerConfig"], "models.instructblip": [ @@ -620,7 +627,7 @@ "models.nougat": ["NougatProcessor"], "models.nystromformer": ["NystromformerConfig"], "models.olmo": ["OlmoConfig"], - "models.olmo_1124": ["Olmo1124Config"], + "models.olmo2": ["Olmo2Config"], "models.olmoe": ["OlmoeConfig"], "models.omdet_turbo": [ "OmDetTurboConfig", @@ -1174,6 +1181,7 @@ _import_structure["image_processing_base"] = ["ImageProcessingMixin"] _import_structure["image_processing_utils"] = ["BaseImageProcessor"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] + _import_structure["models.aria"].extend(["AriaImageProcessor"]) _import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"]) _import_structure["models.bit"].extend(["BitImageProcessor"]) _import_structure["models.blip"].extend(["BlipImageProcessor"]) @@ -1193,7 +1201,7 @@ _import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor") _import_structure["models.deprecated.tvlt"].append("TvltImageProcessor") _import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"]) - _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor", "DetrImageProcessorFast"]) + _import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor"]) _import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"]) _import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"]) _import_structure["models.efficientnet"].append("EfficientNetImageProcessor") @@ -1230,7 +1238,7 @@ _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) _import_structure["models.pvt"].extend(["PvtImageProcessor"]) _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"]) - _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor", "RTDetrImageProcessorFast"]) + _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"]) _import_structure["models.sam"].extend(["SamImageProcessor"]) _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"]) _import_structure["models.seggpt"].extend(["SegGptImageProcessor"]) @@ -1258,6 +1266,10 @@ ] else: _import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"] + _import_structure["models.deformable_detr"].append("DeformableDetrImageProcessorFast") + _import_structure["models.detr"].append("DetrImageProcessorFast") + _import_structure["models.pixtral"].append("PixtralImageProcessorFast") + _import_structure["models.rt_detr"].append("RTDetrImageProcessorFast") _import_structure["models.vit"].append("ViTImageProcessorFast") # PyTorch-backed objects @@ -1400,6 +1412,15 @@ "AltCLIPVisionModel", ] ) + 
_import_structure["models.aria"].extend( + [ + "AriaForConditionalGeneration", + "AriaPreTrainedModel", + "AriaTextForCausalLM", + "AriaTextModel", + "AriaTextPreTrainedModel", + ] + ) _import_structure["models.audio_spectrogram_transformer"].extend( [ "ASTForAudioClassification", @@ -2455,6 +2476,15 @@ "Idefics3Model", "Idefics3PreTrainedModel", "Idefics3Processor", + "Idefics3VisionConfig", + "Idefics3VisionTransformer", + ] + ) + _import_structure["models.ijepa"].extend( + [ + "IJepaForImageClassification", + "IJepaModel", + "IJepaPreTrainedModel", ] ) _import_structure["models.imagegpt"].extend( @@ -2920,11 +2950,11 @@ "OlmoPreTrainedModel", ] ) - _import_structure["models.olmo_1124"].extend( + _import_structure["models.olmo2"].extend( [ - "Olmo1124ForCausalLM", - "Olmo1124Model", - "Olmo1124PreTrainedModel", + "Olmo2ForCausalLM", + "Olmo2Model", + "Olmo2PreTrainedModel", ] ) _import_structure["models.olmoe"].extend( @@ -4977,7 +5007,7 @@ from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin # Generation - from .generation import GenerationConfig, TextIteratorStreamer, TextStreamer, WatermarkingConfig + from .generation import CompileConfig, GenerationConfig, TextIteratorStreamer, TextStreamer, WatermarkingConfig from .hf_argparser import HfArgumentParser # Integrations @@ -5020,6 +5050,11 @@ AltCLIPTextConfig, AltCLIPVisionConfig, ) + from .models.aria import ( + AriaConfig, + AriaProcessor, + AriaTextConfig, + ) from .models.audio_spectrogram_transformer import ( ASTConfig, ASTFeatureExtractor, @@ -5363,6 +5398,7 @@ ) from .models.idefics2 import Idefics2Config from .models.idefics3 import Idefics3Config + from .models.ijepa import IJepaConfig from .models.imagegpt import ImageGPTConfig from .models.informer import InformerConfig from .models.instructblip import ( @@ -5514,7 +5550,7 @@ NystromformerConfig, ) from .models.olmo import OlmoConfig - from .models.olmo_1124 import Olmo1124Config + from .models.olmo2 import Olmo2Config from .models.olmoe import OlmoeConfig from .models.omdet_turbo import ( OmDetTurboConfig, @@ -6082,6 +6118,7 @@ from .image_processing_base import ImageProcessingMixin from .image_processing_utils import BaseImageProcessor from .image_utils import ImageFeatureExtractionMixin + from .models.aria import AriaImageProcessor from .models.beit import BeitFeatureExtractor, BeitImageProcessor from .models.bit import BitImageProcessor from .models.blip import BlipImageProcessor @@ -6097,16 +6134,13 @@ ConditionalDetrImageProcessor, ) from .models.convnext import ConvNextFeatureExtractor, ConvNextImageProcessor - from .models.deformable_detr import ( - DeformableDetrFeatureExtractor, - DeformableDetrImageProcessor, - ) + from .models.deformable_detr import DeformableDetrFeatureExtractor, DeformableDetrImageProcessor from .models.deit import DeiTFeatureExtractor, DeiTImageProcessor from .models.deprecated.deta import DetaImageProcessor from .models.deprecated.efficientformer import EfficientFormerImageProcessor from .models.deprecated.tvlt import TvltImageProcessor from .models.deprecated.vit_hybrid import ViTHybridImageProcessor - from .models.detr import DetrFeatureExtractor, DetrImageProcessor, DetrImageProcessorFast + from .models.detr import DetrFeatureExtractor, DetrImageProcessor from .models.donut import DonutFeatureExtractor, DonutImageProcessor from .models.dpt import DPTFeatureExtractor, DPTImageProcessor from .models.efficientnet import EfficientNetImageProcessor @@ -6163,7 +6197,7 @@ ) from .models.pvt import PvtImageProcessor from 
.models.qwen2_vl import Qwen2VLImageProcessor - from .models.rt_detr import RTDetrImageProcessor, RTDetrImageProcessorFast + from .models.rt_detr import RTDetrImageProcessor from .models.sam import SamImageProcessor from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor from .models.seggpt import SegGptImageProcessor @@ -6187,6 +6221,10 @@ from .utils.dummy_torchvision_objects import * else: from .image_processing_utils_fast import BaseImageProcessorFast + from .models.deformable_detr import DeformableDetrImageProcessorFast + from .models.detr import DetrImageProcessorFast + from .models.pixtral import PixtralImageProcessorFast + from .models.rt_detr import RTDetrImageProcessorFast from .models.vit import ViTImageProcessorFast # Modeling @@ -6310,6 +6348,13 @@ AltCLIPTextModel, AltCLIPVisionModel, ) + from .models.aria import ( + AriaForConditionalGeneration, + AriaPreTrainedModel, + AriaTextForCausalLM, + AriaTextModel, + AriaTextPreTrainedModel, + ) from .models.audio_spectrogram_transformer import ( ASTForAudioClassification, ASTModel, @@ -7174,6 +7219,13 @@ Idefics3Model, Idefics3PreTrainedModel, Idefics3Processor, + Idefics3VisionConfig, + Idefics3VisionTransformer, + ) + from .models.ijepa import ( + IJepaForImageClassification, + IJepaModel, + IJepaPreTrainedModel, ) from .models.imagegpt import ( ImageGPTForCausalImageModeling, @@ -7532,10 +7584,10 @@ OlmoModel, OlmoPreTrainedModel, ) - from .models.olmo_1124 import ( - Olmo1124ForCausalLM, - Olmo1124Model, - Olmo1124PreTrainedModel, + from .models.olmo2 import ( + Olmo2ForCausalLM, + Olmo2Model, + Olmo2PreTrainedModel, ) from .models.olmoe import ( OlmoeForCausalLM, diff --git a/src/transformers/agents/agents.py b/src/transformers/agents/agents.py index c461c50f29592c..08c30d54fd43d5 100644 --- a/src/transformers/agents/agents.py +++ b/src/transformers/agents/agents.py @@ -17,7 +17,8 @@ import json import logging import re -from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union +import time +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from .. 
import is_torch_available from ..utils import logging as transformers_logging @@ -25,6 +26,7 @@ from .agent_types import AgentAudio, AgentImage from .default_tools import BASE_PYTHON_TOOLS, FinalAnswerTool, setup_default_tools from .llm_engine import HfApiEngine, MessageRole +from .monitoring import Monitor from .prompts import ( DEFAULT_CODE_SYSTEM_PROMPT, DEFAULT_REACT_CODE_SYSTEM_PROMPT, @@ -353,17 +355,23 @@ class Agent: def __init__( self, tools: Union[List[Tool], Toolbox], - llm_engine: Callable = HfApiEngine(), - system_prompt=DEFAULT_REACT_CODE_SYSTEM_PROMPT, - tool_description_template=None, - additional_args={}, + llm_engine: Callable = None, + system_prompt: Optional[str] = None, + tool_description_template: Optional[str] = None, + additional_args: Dict = {}, max_iterations: int = 6, - tool_parser=parse_json_tool_call, + tool_parser: Optional[Callable] = None, add_base_tools: bool = False, verbose: int = 0, - grammar: Dict[str, str] = None, - managed_agents: List = None, + grammar: Optional[Dict[str, str]] = None, + managed_agents: Optional[List] = None, + step_callbacks: Optional[List[Callable]] = None, + monitor_metrics: bool = True, ): + if system_prompt is None: + system_prompt = DEFAULT_REACT_CODE_SYSTEM_PROMPT + if tool_parser is None: + tool_parser = parse_json_tool_call self.agent_name = self.__class__.__name__ self.llm_engine = llm_engine self.system_prompt_template = system_prompt @@ -406,6 +414,15 @@ def __init__( elif verbose == 2: logger.setLevel(logging.DEBUG) + # Initialize step callbacks + self.step_callbacks = step_callbacks if step_callbacks is not None else [] + + # Initialize Monitor if monitor_metrics is True + self.monitor = None + if monitor_metrics: + self.monitor = Monitor(self.llm_engine) + self.step_callbacks.append(self.monitor.update_metrics) + @property def toolbox(self) -> Toolbox: """Get the toolbox currently available to the agent""" @@ -578,13 +595,19 @@ class CodeAgent(Agent): def __init__( self, tools: List[Tool], - llm_engine: Callable = HfApiEngine(), - system_prompt: str = DEFAULT_CODE_SYSTEM_PROMPT, - tool_description_template: str = DEFAULT_TOOL_DESCRIPTION_TEMPLATE, - grammar: Dict[str, str] = None, + llm_engine: Optional[Callable] = None, + system_prompt: Optional[str] = None, + tool_description_template: Optional[str] = None, + grammar: Optional[Dict[str, str]] = None, additional_authorized_imports: Optional[List[str]] = None, **kwargs, ): + if llm_engine is None: + llm_engine = HfApiEngine() + if system_prompt is None: + system_prompt = DEFAULT_CODE_SYSTEM_PROMPT + if tool_description_template is None: + tool_description_template = DEFAULT_TOOL_DESCRIPTION_TEMPLATE super().__init__( tools=tools, llm_engine=llm_engine, @@ -700,15 +723,24 @@ class ReactAgent(Agent): def __init__( self, tools: List[Tool], - llm_engine: Callable = HfApiEngine(), - system_prompt: str = DEFAULT_REACT_CODE_SYSTEM_PROMPT, - tool_description_template: str = DEFAULT_TOOL_DESCRIPTION_TEMPLATE, - grammar: Dict[str, str] = None, - plan_type: Literal[tuple(SUPPORTED_PLAN_TYPES)] = SUPPORTED_PLAN_TYPES[0], + llm_engine: Optional[Callable] = None, + system_prompt: Optional[str] = None, + tool_description_template: Optional[str] = None, + grammar: Optional[Dict[str, str]] = None, + plan_type: Optional[str] = None, planning_interval: Optional[int] = None, **kwargs, ): - assert plan_type in SUPPORTED_PLAN_TYPES, f"plan type {plan_type} is not supported" + if llm_engine is None: + llm_engine = HfApiEngine() + if system_prompt is None: + system_prompt = 
DEFAULT_REACT_CODE_SYSTEM_PROMPT + if tool_description_template is None: + tool_description_template = DEFAULT_TOOL_DESCRIPTION_TEMPLATE + if plan_type is None: + plan_type = SUPPORTED_PLAN_TYPES[0] + else: + assert plan_type in SUPPORTED_PLAN_TYPES, f"plan type {plan_type} is not supported" super().__init__( tools=tools, llm_engine=llm_engine, @@ -776,16 +808,24 @@ def stream_run(self, task: str): final_answer = None iteration = 0 while final_answer is None and iteration < self.max_iterations: + step_start_time = time.time() + step_log_entry = {"iteration": iteration, "start_time": step_start_time} try: - step_logs = self.step() - if "final_answer" in step_logs: - final_answer = step_logs["final_answer"] + self.step(step_log_entry) + if "final_answer" in step_log_entry: + final_answer = step_log_entry["final_answer"] except AgentError as e: self.logger.error(e, exc_info=1) - self.logs[-1]["error"] = e + step_log_entry["error"] = e finally: + step_end_time = time.time() + step_log_entry["step_end_time"] = step_end_time + step_log_entry["step_duration"] = step_end_time - step_start_time + self.logs.append(step_log_entry) + for callback in self.step_callbacks: + callback(step_log_entry) iteration += 1 - yield self.logs[-1] + yield step_log_entry if final_answer is None and iteration == self.max_iterations: error_message = "Reached max iterations." @@ -794,6 +834,9 @@ def stream_run(self, task: str): self.logger.error(error_message, exc_info=1) final_answer = self.provide_final_answer(task) final_step_log["final_answer"] = final_answer + final_step_log["step_duration"] = 0 + for callback in self.step_callbacks: + callback(final_step_log) yield final_step_log yield final_answer @@ -805,16 +848,24 @@ def direct_run(self, task: str): final_answer = None iteration = 0 while final_answer is None and iteration < self.max_iterations: + step_start_time = time.time() + step_log_entry = {"iteration": iteration, "start_time": step_start_time} try: if self.planning_interval is not None and iteration % self.planning_interval == 0: self.planning_step(task, is_first_step=(iteration == 0), iteration=iteration) - step_logs = self.step() - if "final_answer" in step_logs: - final_answer = step_logs["final_answer"] + self.step(step_log_entry) + if "final_answer" in step_log_entry: + final_answer = step_log_entry["final_answer"] except AgentError as e: self.logger.error(e, exc_info=1) - self.logs[-1]["error"] = e + step_log_entry["error"] = e finally: + step_end_time = time.time() + step_log_entry["step_end_time"] = step_end_time + step_log_entry["step_duration"] = step_end_time - step_start_time + self.logs.append(step_log_entry) + for callback in self.step_callbacks: + callback(step_log_entry) iteration += 1 if final_answer is None and iteration == self.max_iterations: @@ -824,6 +875,9 @@ def direct_run(self, task: str): self.logger.error(error_message, exc_info=1) final_answer = self.provide_final_answer(task) final_step_log["final_answer"] = final_answer + final_step_log["step_duration"] = 0 + for callback in self.step_callbacks: + callback(final_step_log) return final_answer @@ -937,13 +991,19 @@ class ReactJsonAgent(ReactAgent): def __init__( self, tools: List[Tool], - llm_engine: Callable = HfApiEngine(), - system_prompt: str = DEFAULT_REACT_JSON_SYSTEM_PROMPT, - tool_description_template: str = DEFAULT_TOOL_DESCRIPTION_TEMPLATE, - grammar: Dict[str, str] = None, + llm_engine: Optional[Callable] = None, + system_prompt: Optional[str] = None, + tool_description_template: Optional[str] = None, + grammar: 
Optional[Dict[str, str]] = None, planning_interval: Optional[int] = None, **kwargs, ): + if llm_engine is None: + llm_engine = HfApiEngine() + if system_prompt is None: + system_prompt = DEFAULT_REACT_JSON_SYSTEM_PROMPT + if tool_description_template is None: + tool_description_template = DEFAULT_TOOL_DESCRIPTION_TEMPLATE super().__init__( tools=tools, llm_engine=llm_engine, @@ -954,7 +1014,7 @@ def __init__( **kwargs, ) - def step(self): + def step(self, log_entry: Dict[str, Any]): """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. The errors are raised here, they are caught and logged in the run() method. @@ -965,9 +1025,7 @@ def step(self): self.logger.debug("===== New step =====") # Add new step in logs - current_step_logs = {} - self.logs.append(current_step_logs) - current_step_logs["agent_memory"] = agent_memory.copy() + log_entry["agent_memory"] = agent_memory.copy() self.logger.info("===== Calling LLM with this last message: =====") self.logger.info(self.prompt[-1]) @@ -981,7 +1039,7 @@ def step(self): raise AgentGenerationError(f"Error in generating llm output: {e}.") self.logger.debug("===== Output message of the LLM: =====") self.logger.debug(llm_output) - current_step_logs["llm_output"] = llm_output + log_entry["llm_output"] = llm_output # Parse self.logger.debug("===== Extracting action =====") @@ -992,8 +1050,8 @@ def step(self): except Exception as e: raise AgentParsingError(f"Could not parse the given action: {e}.") - current_step_logs["rationale"] = rationale - current_step_logs["tool_call"] = {"tool_name": tool_name, "tool_arguments": arguments} + log_entry["rationale"] = rationale + log_entry["tool_call"] = {"tool_name": tool_name, "tool_arguments": arguments} # Execute self.logger.warning("=== Agent thoughts:") @@ -1011,8 +1069,8 @@ def step(self): answer = arguments else: answer = arguments - current_step_logs["final_answer"] = answer - return current_step_logs + log_entry["final_answer"] = answer + return answer else: if arguments is None: arguments = {} @@ -1030,8 +1088,8 @@ def step(self): else: updated_information = str(observation).strip() self.logger.info(updated_information) - current_step_logs["observation"] = updated_information - return current_step_logs + log_entry["observation"] = updated_information + return log_entry class ReactCodeAgent(ReactAgent): @@ -1044,14 +1102,20 @@ class ReactCodeAgent(ReactAgent): def __init__( self, tools: List[Tool], - llm_engine: Callable = HfApiEngine(), - system_prompt: str = DEFAULT_REACT_CODE_SYSTEM_PROMPT, - tool_description_template: str = DEFAULT_TOOL_DESCRIPTION_TEMPLATE, - grammar: Dict[str, str] = None, + llm_engine: Optional[Callable] = None, + system_prompt: Optional[str] = None, + tool_description_template: Optional[str] = None, + grammar: Optional[Dict[str, str]] = None, additional_authorized_imports: Optional[List[str]] = None, planning_interval: Optional[int] = None, **kwargs, ): + if llm_engine is None: + llm_engine = HfApiEngine() + if system_prompt is None: + system_prompt = DEFAULT_REACT_CODE_SYSTEM_PROMPT + if tool_description_template is None: + tool_description_template = DEFAULT_TOOL_DESCRIPTION_TEMPLATE super().__init__( tools=tools, llm_engine=llm_engine, @@ -1075,7 +1139,7 @@ def __init__( self.system_prompt = self.system_prompt.replace("<>", str(self.authorized_imports)) self.custom_tools = {} - def step(self): + def step(self, log_entry: Dict[str, Any]): """ Perform one step in the ReAct framework: the agent thinks, acts, and observes the result. 
The errors are raised here, they are caught and logged in the run() method. @@ -1083,13 +1147,10 @@ def step(self): agent_memory = self.write_inner_memory_from_logs() self.prompt = agent_memory.copy() - self.logger.debug("===== New step =====") # Add new step in logs - current_step_logs = {} - self.logs.append(current_step_logs) - current_step_logs["agent_memory"] = agent_memory.copy() + log_entry["agent_memory"] = agent_memory.copy() self.logger.info("===== Calling LLM with these last messages: =====") self.logger.info(self.prompt[-2:]) @@ -1104,7 +1165,7 @@ def step(self): self.logger.debug("=== Output message of the LLM:") self.logger.debug(llm_output) - current_step_logs["llm_output"] = llm_output + log_entry["llm_output"] = llm_output # Parse self.logger.debug("=== Extracting action ===") @@ -1120,8 +1181,8 @@ def step(self): error_msg = f"Error in code parsing: {e}. Make sure to provide correct code" raise AgentParsingError(error_msg) - current_step_logs["rationale"] = rationale - current_step_logs["tool_call"] = {"tool_name": "code interpreter", "tool_arguments": code_action} + log_entry["rationale"] = rationale + log_entry["tool_call"] = {"tool_name": "code interpreter", "tool_arguments": code_action} # Execute self.log_rationale_code_action(rationale, code_action) @@ -1146,7 +1207,7 @@ def step(self): self.logger.warning("Last output from code snippet:") self.logger.log(32, str(result)) observation += "Last output from code snippet:\n" + str(result)[:100000] - current_step_logs["observation"] = observation + log_entry["observation"] = observation except Exception as e: error_msg = f"Code execution failed due to the following error:\n{str(e)}" if "'dict' object has no attribute 'read'" in str(e): @@ -1156,8 +1217,11 @@ def step(self): if line[: len("final_answer")] == "final_answer": self.logger.log(33, "Final answer:") self.logger.log(32, result) - current_step_logs["final_answer"] = result - return current_step_logs + log_entry["final_answer"] = result + return result + + +LENGTH_TRUNCATE_REPORTS = 1000 class ManagedAgent: @@ -1200,10 +1264,14 @@ def __call__(self, request, **kwargs): answer += f"\n\nFor more detail, find below a summary of this agent's work:\nSUMMARY OF WORK FROM AGENT '{self.name}':\n" for message in self.agent.write_inner_memory_from_logs(summary_mode=True): content = message["content"] - if len(str(content)) < 1000 or "[FACTS LIST]" in str(content): + if len(str(content)) < LENGTH_TRUNCATE_REPORTS or "[FACTS LIST]" in str(content): answer += "\n" + str(content) + "\n---" else: - answer += "\n" + str(content)[:1000] + "\n(...Step was truncated because too long)...\n---" + answer += ( + "\n" + + str(content)[:LENGTH_TRUNCATE_REPORTS] + + "\n(...Step was truncated because too long)...\n---" + ) answer += f"\nEND OF SUMMARY OF WORK FROM AGENT '{self.name}'." return answer else: diff --git a/src/transformers/agents/llm_engine.py b/src/transformers/agents/llm_engine.py index 456c6172a77cb0..afa4d62d059e5b 100644 --- a/src/transformers/agents/llm_engine.py +++ b/src/transformers/agents/llm_engine.py @@ -20,7 +20,12 @@ from huggingface_hub import InferenceClient +from .. import AutoTokenizer from ..pipelines.base import Pipeline +from ..utils import logging + + +logger = logging.get_logger(__name__) class MessageRole(str, Enum): @@ -67,46 +72,32 @@ def get_clean_message_list(message_list: List[Dict[str, str]], role_conversions: } -class HfApiEngine: - """A class to interact with Hugging Face's Inference API for language model interaction. 
- - This engine allows you to communicate with Hugging Face's models using the Inference API. It can be used in both serverless mode or with a dedicated endpoint, supporting features like stop sequences and grammar customization. - - Parameters: - model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3.1-8B-Instruct"`): - The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. - token (`str`, *optional*): - The Hugging Face API token for authentication. If not provided, the class will use the token stored in the Hugging Face CLI configuration. - max_tokens (`int`, *optional*, defaults to 1500): - The maximum number of tokens allowed in the output. - timeout (`int`, *optional*, defaults to 120): - Timeout for the API request, in seconds. - - Raises: - ValueError: - If the model name is not provided. - """ - - def __init__( - self, - model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct", - token: Optional[str] = None, - max_tokens: Optional[int] = 1500, - timeout: Optional[int] = 120, +class HfEngine: + def __init__(self, model_id: Optional[str] = None): + self.last_input_token_count = None + self.last_output_token_count = None + if model_id is None: + model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct" + logger.warning(f"Using default model for token counting: '{model_id}'") + try: + self.tokenizer = AutoTokenizer.from_pretrained(model_id) + except Exception as e: + logger.warning(f"Failed to load tokenizer for model {model_id}: {e}. Loading default tokenizer instead.") + self.tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct") + + def get_token_counts(self): + return { + "input_token_count": self.last_input_token_count, + "output_token_count": self.last_output_token_count, + } + + def generate( + self, messages: List[Dict[str, str]], stop_sequences: Optional[List[str]] = None, grammar: Optional[str] = None ): - """Initialize the HfApiEngine.""" - if not model: - raise ValueError("Model name must be provided.") - - self.model = model - self.client = InferenceClient(self.model, token=token, timeout=timeout) - self.max_tokens = max_tokens + raise NotImplementedError def __call__( - self, - messages: List[Dict[str, str]], - stop_sequences: Optional[List[str]] = [], - grammar: Optional[str] = None, + self, messages: List[Dict[str, str]], stop_sequences: Optional[List[str]] = None, grammar: Optional[str] = None ) -> str: """Process the input messages and return the model's response. @@ -136,6 +127,57 @@ def __call__( "Quantum mechanics is the branch of physics that studies..." ``` """ + if not isinstance(messages, List): + raise ValueError("Messages should be a list of dictionaries with 'role' and 'content' keys.") + if stop_sequences is None: + stop_sequences = [] + response = self.generate(messages, stop_sequences, grammar) + self.last_input_token_count = len(self.tokenizer.apply_chat_template(messages, tokenize=True)) + self.last_output_token_count = len(self.tokenizer.encode(response)) + + # Remove stop sequences from LLM output + for stop_seq in stop_sequences: + if response[-len(stop_seq) :] == stop_seq: + response = response[: -len(stop_seq)] + return response + + +class HfApiEngine(HfEngine): + """A class to interact with Hugging Face's Inference API for language model interaction. + + This engine allows you to communicate with Hugging Face's models using the Inference API. 
It can be used in both serverless mode or with a dedicated endpoint, supporting features like stop sequences and grammar customization. + + Parameters: + model (`str`, *optional*, defaults to `"meta-llama/Meta-Llama-3.1-8B-Instruct"`): + The Hugging Face model ID to be used for inference. This can be a path or model identifier from the Hugging Face model hub. + token (`str`, *optional*): + Token used by the Hugging Face API for authentication. + If not provided, the class will use the token stored in the Hugging Face CLI configuration. + max_tokens (`int`, *optional*, defaults to 1500): + The maximum number of tokens allowed in the output. + timeout (`int`, *optional*, defaults to 120): + Timeout for the API request, in seconds. + + Raises: + ValueError: + If the model name is not provided. + """ + + def __init__( + self, + model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct", + token: Optional[str] = None, + max_tokens: Optional[int] = 1500, + timeout: Optional[int] = 120, + ): + super().__init__(model_id=model) + self.model = model + self.client = InferenceClient(self.model, token=token, timeout=timeout) + self.max_tokens = max_tokens + + def generate( + self, messages: List[Dict[str, str]], stop_sequences: Optional[List[str]] = None, grammar: Optional[str] = None + ) -> str: # Get clean message list messages = get_clean_message_list(messages, role_conversions=llama_role_conversions) @@ -148,41 +190,40 @@ def __call__( response = self.client.chat_completion(messages, stop=stop_sequences, max_tokens=self.max_tokens) response = response.choices[0].message.content - - # Remove stop sequences from LLM output - for stop_seq in stop_sequences: - if response[-len(stop_seq) :] == stop_seq: - response = response[: -len(stop_seq)] return response -class TransformersEngine: +class TransformersEngine(HfEngine): """This engine uses a pre-initialized local text-generation pipeline.""" - def __init__(self, pipeline: Pipeline): + def __init__(self, pipeline: Pipeline, model_id: Optional[str] = None): + super().__init__(model_id) self.pipeline = pipeline - def __call__( - self, messages: List[Dict[str, str]], stop_sequences: Optional[List[str]] = None, grammar: Optional[str] = None + def generate( + self, + messages: List[Dict[str, str]], + stop_sequences: Optional[List[str]] = None, + grammar: Optional[str] = None, + max_length: int = 1500, ) -> str: # Get clean message list messages = get_clean_message_list(messages, role_conversions=llama_role_conversions) # Get LLM output + if stop_sequences is not None and len(stop_sequences) > 0: + stop_strings = stop_sequences + else: + stop_strings = None + output = self.pipeline( messages, - stop_strings=stop_sequences, - max_length=1500, + stop_strings=stop_strings, + max_length=max_length, tokenizer=self.pipeline.tokenizer, ) response = output[0]["generated_text"][-1]["content"] - - # Remove stop sequences from LLM output - if stop_sequences is not None: - for stop_seq in stop_sequences: - if response[-len(stop_seq) :] == stop_seq: - response = response[: -len(stop_seq)] return response diff --git a/src/transformers/agents/monitoring.py b/src/transformers/agents/monitoring.py index 755418d35a56a3..7126e72b5fd060 100644 --- a/src/transformers/agents/monitoring.py +++ b/src/transformers/agents/monitoring.py @@ -14,8 +14,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from ..utils import logging from .agent_types import AgentAudio, AgentImage, AgentText -from .agents import ReactAgent + + +logger = logging.get_logger(__name__) def pull_message(step_log: dict, test_mode: bool = True): @@ -54,7 +57,7 @@ def __init__(self, role, content, metadata=None): ) -def stream_to_gradio(agent: ReactAgent, task: str, test_mode: bool = False, **kwargs): +def stream_to_gradio(agent, task: str, test_mode: bool = False, **kwargs): """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages.""" try: @@ -91,3 +94,24 @@ def __init__(self, role, content, metadata=None): ) else: yield ChatMessage(role="assistant", content=str(final_answer)) + + +class Monitor: + def __init__(self, tracked_llm_engine): + self.step_durations = [] + self.tracked_llm_engine = tracked_llm_engine + if getattr(self.tracked_llm_engine, "last_input_token_count", "Not found") != "Not found": + self.total_input_token_count = 0 + self.total_output_token_count = 0 + + def update_metrics(self, step_log): + step_duration = step_log["step_duration"] + self.step_durations.append(step_duration) + logger.info(f"Step {len(self.step_durations)}:") + logger.info(f"- Time taken: {step_duration:.2f} seconds (valid only if step succeeded)") + + if getattr(self.tracked_llm_engine, "last_input_token_count", None) is not None: + self.total_input_token_count += self.tracked_llm_engine.last_input_token_count + self.total_output_token_count += self.tracked_llm_engine.last_output_token_count + logger.info(f"- Input tokens: {self.total_input_token_count}") + logger.info(f"- Output tokens: {self.total_output_token_count}") diff --git a/src/transformers/agents/prompts.py b/src/transformers/agents/prompts.py index 7a84b1db44faba..898a7e011a2b05 100644 --- a/src/transformers/agents/prompts.py +++ b/src/transformers/agents/prompts.py @@ -129,7 +129,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): ``` --- -Above example were using tools that might not exist for you. You only have acces to those Tools: +Above example were using tools that might not exist for you. You only have access to these Tools: <> Remember to make sure that variables you use are all defined. @@ -256,7 +256,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): } -Above example were using notional tools that might not exist for you. You only have acces to those tools: +Above example were using notional tools that might not exist for you. You only have access to these tools: <> Here are the rules you should always follow to solve your task: @@ -348,7 +348,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): final_answer(pope_current_age) ``` -Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you have acces to those tools (and no other tool): +Above example were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you have access to these tools (and no other tool): <> diff --git a/src/transformers/agents/search.py b/src/transformers/agents/search.py index f50a7c6ab8f94e..1c2c33913c97fb 100644 --- a/src/transformers/agents/search.py +++ b/src/transformers/agents/search.py @@ -42,7 +42,7 @@ def forward(self, query: str) -> str: class VisitWebpageTool(Tool): name = "visit_webpage" - description = "Visits a wbepage at the given url and returns its content as a markdown string." 
+ description = "Visits a webpage at the given url and returns its content as a markdown string." inputs = { "url": { "type": "string", diff --git a/src/transformers/agents/tools.py b/src/transformers/agents/tools.py index 84bcf0fde61f18..759704612c2f4f 100644 --- a/src/transformers/agents/tools.py +++ b/src/transformers/agents/tools.py @@ -23,6 +23,7 @@ import os import tempfile from functools import lru_cache, wraps +from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Union from huggingface_hub import create_repo, get_collection, hf_hub_download, metadata_update, upload_folder @@ -45,7 +46,7 @@ is_vision_available, logging, ) -from .agent_types import handle_agent_inputs, handle_agent_outputs +from .agent_types import ImageType, handle_agent_inputs, handle_agent_outputs logger = logging.get_logger(__name__) @@ -386,7 +387,7 @@ def push_to_hub( commit_message (`str`, *optional*, defaults to `"Upload tool"`): Message to commit while pushing. private (`bool`, *optional*): - Whether or not the repository created should be private. + Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists. token (`bool` or `str`, *optional*): The token to use as HTTP bearer authorization for remote files. If unset, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). @@ -418,7 +419,9 @@ def push_to_hub( ) @staticmethod - def from_space(space_id, name, description): + def from_space( + space_id: str, name: str, description: str, api_name: Optional[str] = None, token: Optional[str] = None + ): """ Creates a [`Tool`] from a Space given its id on the Hub. @@ -429,34 +432,73 @@ def from_space(space_id, name, description): The name of the tool. description (`str`): The description of the tool. - + api_name (`str`, *optional*): + The specific api_name to use, if the space has several tabs. If not precised, will default to the first available api. + token (`str`, *optional*): + Add your token to access private spaces or increase your GPU quotas. Returns: [`Tool`]: - The created tool. + The Space, as a tool. - Example: + Examples: + ``` + image_generator = Tool.from_space( + space_id="black-forest-labs/FLUX.1-schnell", + name="image-generator", + description="Generate an image from a prompt" + ) + image = image_generator("Generate an image of a cool surfer in Tahiti") + ``` ``` - tool = Tool.from_space("black-forest-labs/FLUX.1-schnell", "image-generator", "Generate an image from a prompt") + face_swapper = Tool.from_space( + "tuan2308/face-swap", + "face_swapper", + "Tool that puts the face shown on the first image on the second image. 
You can give it paths to images.", + ) + image = face_swapper('./aymeric.jpeg', './ruth.jpg') ``` """ - from gradio_client import Client + from gradio_client import Client, handle_file + from gradio_client.utils import is_http_url_like class SpaceToolWrapper(Tool): - def __init__(self, space_id, name, description): - self.client = Client(space_id) + def __init__( + self, + space_id: str, + name: str, + description: str, + api_name: Optional[str] = None, + token: Optional[str] = None, + ): + self.client = Client(space_id, hf_token=token) self.name = name self.description = description - space_description = self.client.view_api(return_format="dict")["named_endpoints"] - route = list(space_description.keys())[0] - space_description_route = space_description[route] + space_description = self.client.view_api(return_format="dict", print_info=False)["named_endpoints"] + + # If api_name is not defined, take the first of the available APIs for this space + if api_name is None: + api_name = list(space_description.keys())[0] + logger.warning( + f"Since `api_name` was not defined, it was automatically set to the first avilable API: `{api_name}`." + ) + self.api_name = api_name + + try: + space_description_api = space_description[api_name] + except KeyError: + raise KeyError(f"Could not find specified {api_name=} among available api names.") + self.inputs = {} - for parameter in space_description_route["parameters"]: + for parameter in space_description_api["parameters"]: if not parameter["parameter_has_default"]: + parameter_type = parameter["type"]["type"] + if parameter_type == "object": + parameter_type = "any" self.inputs[parameter["parameter_name"]] = { - "type": parameter["type"]["type"], + "type": parameter_type, "description": parameter["python_type"]["description"], } - output_component = space_description_route["returns"][0]["component"] + output_component = space_description_api["returns"][0]["component"] if output_component == "Image": self.output_type = "image" elif output_component == "Audio": @@ -464,10 +506,33 @@ def __init__(self, space_id, name, description): else: self.output_type = "any" - def forward(self, *args, **kwargs): - return self.client.predict(*args, **kwargs)[0] # Usually the first output is the result + def sanitize_argument_for_prediction(self, arg): + if isinstance(arg, ImageType): + temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False) + arg.save(temp_file.name) + arg = temp_file.name + if (isinstance(arg, (str, Path)) and Path(arg).exists() and Path(arg).is_file()) or is_http_url_like( + arg + ): + arg = handle_file(arg) + return arg - return SpaceToolWrapper(space_id, name, description) + def forward(self, *args, **kwargs): + # Preprocess args and kwargs: + args = list(args) + for i, arg in enumerate(args): + args[i] = self.sanitize_argument_for_prediction(arg) + for arg_name, arg in kwargs.items(): + kwargs[arg_name] = self.sanitize_argument_for_prediction(arg) + + output = self.client.predict(*args, api_name=self.api_name, **kwargs) + if isinstance(output, tuple) or isinstance(output, list): + return output[ + 0 + ] # Sometime the space also returns the generation seed, in which case the result is at index 0 + return output + + return SpaceToolWrapper(space_id, name, description, api_name=api_name, token=token) @staticmethod def from_gradio(gradio_tool): @@ -720,21 +785,22 @@ def launch_gradio_demo(tool_class: Tool): def fn(*args, **kwargs): return tool(*args, **kwargs) + TYPE_TO_COMPONENT_CLASS_MAPPING = { + "image": gr.Image, + "audio": gr.Audio, 
+ "string": gr.Textbox, + "integer": gr.Textbox, + "number": gr.Textbox, + } + gradio_inputs = [] for input_name, input_details in tool_class.inputs.items(): - input_type = input_details["type"] - if input_type == "image": - gradio_inputs.append(gr.Image(label=input_name)) - elif input_type == "audio": - gradio_inputs.append(gr.Audio(label=input_name)) - elif input_type in ["string", "integer", "number"]: - gradio_inputs.append(gr.Textbox(label=input_name)) - else: - error_message = f"Input type '{input_type}' not supported." - raise ValueError(error_message) + input_gradio_component_class = TYPE_TO_COMPONENT_CLASS_MAPPING[input_details["type"]] + new_component = input_gradio_component_class(label=input_name) + gradio_inputs.append(new_component) - gradio_output = tool_class.output_type - assert gradio_output in ["string", "image", "audio"], f"Output type '{gradio_output}' not supported." + output_gradio_componentclass = TYPE_TO_COMPONENT_CLASS_MAPPING[tool_class.output_type] + gradio_output = output_gradio_componentclass(label=input_name) gr.Interface( fn=fn, diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index 0f696cc3ac6a4d..23f2177b25d529 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -12,7 +12,6 @@ from .utils import ( is_hqq_available, is_optimum_quanto_available, - is_quanto_available, is_torchdynamo_compiling, logging, ) @@ -433,19 +432,22 @@ def update( self._seen_tokens += key_states.shape[-2] # Update the cache - if len(self.key_cache) <= layer_idx: - # There may be skipped layers, fill them with empty lists - for _ in range(len(self.key_cache), layer_idx): - self.key_cache.append([]) - self.value_cache.append([]) - self.key_cache.append(key_states) - self.value_cache.append(value_states) - elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors - self.key_cache[layer_idx] = key_states - self.value_cache[layer_idx] = value_states - else: - self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) - self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) + if key_states is not None: + if len(self.key_cache) <= layer_idx: + # There may be skipped layers, fill them with empty lists + for _ in range(len(self.key_cache), layer_idx): + self.key_cache.append([]) + self.value_cache.append([]) + self.key_cache.append(key_states) + self.value_cache.append(value_states) + elif ( + len(self.key_cache[layer_idx]) == 0 + ): # fills previously skipped layers; checking for tensor causes errors + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) return self.key_cache[layer_idx], self.value_cache[layer_idx] @@ -525,7 +527,7 @@ def from_batch_splits(cls, splits: List["DynamicCache"], num_hidden_layers: int cache = cls() for idx in range(len(splits[0])): key_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []] - value_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []] + value_cache = [current.value_cache[idx] for current in splits if current.value_cache[idx] != []] if key_cache != []: layer_keys = torch.cat(key_cache, dim=0) layer_values = torch.cat(value_cache, dim=0) @@ -781,18 +783,12 @@ def __init__(self, 
cache_config: CacheConfig) -> None: super().__init__(cache_config) if is_optimum_quanto_available(): - from optimum.quanto import MaxOptimizer, qint2, qint4 - elif is_quanto_available(): - logger.warning_once( - "Importing from quanto will be deprecated in v4.47. Please install optimum-quanto instead `pip install optimum-quanto`" - ) - quanto_version = version.parse(importlib.metadata.version("quanto")) - if quanto_version < version.parse("0.2.0"): + optimum_quanto_version = version.parse(importlib.metadata.version("optimum-quanto")) + if optimum_quanto_version <= version.parse("0.2.5"): raise ImportError( - f"You need quanto package version to be greater or equal than 0.2.0 to use `QuantoQuantizedCache`. Detected version {quanto_version}. " - f"Since quanto will be deprecated, please install optimum-quanto instead with `pip install -U optimum-quanto`" + f"You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. Detected version {optimum_quanto_version}." ) - from quanto import MaxOptimizer, qint2, qint4 + from optimum.quanto import MaxOptimizer, qint2, qint4 if self.nbits not in [2, 4]: raise ValueError(f"`nbits` for `quanto` backend has to be one of [`2`, `4`] but got {self.nbits}") @@ -813,18 +809,9 @@ def _quantize(self, tensor, axis): if is_optimum_quanto_available(): from optimum.quanto import quantize_weight - qtensor = quantize_weight(tensor, self.qtype, axis, self.q_group_size) + scale, zeropoint = self.optimizer(tensor, self.qtype, axis, self.q_group_size) + qtensor = quantize_weight(tensor, self.qtype, axis, scale, zeropoint, self.q_group_size) return qtensor - elif is_quanto_available(): - logger.warning_once( - "Importing from quanto will be deprecated in v4.47. Please install optimum-quanto instead `pip install optimum-quanto`" - ) - from quanto import AffineQuantizer - - scale, zeropoint = self.optimizer(tensor, self.qtype.bits, axis, self.q_group_size) - qtensor = AffineQuantizer.apply(tensor, self.qtype, axis, self.q_group_size, scale, zeropoint) - - return qtensor def _dequantize(self, qtensor): return qtensor.dequantize() @@ -1131,13 +1118,13 @@ def __init__( layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, ) -> None: super().__init__() - if max_batch_size is not None: + if batch_size is not None: logger.warning_once( - f"The 'max_batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in " - "v4.46. Use the more precisely named 'batch_size' argument instead." + f"The 'batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in " + "v4.49. Use the more precisely named 'max_batch_size' argument instead." ) - self.batch_size = batch_size or max_batch_size + self.max_batch_size = batch_size or max_batch_size self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads @@ -1208,6 +1195,8 @@ def update( k_out = self.key_cache[layer_idx] v_out = self.value_cache[layer_idx] + key_states = key_states.to(k_out.dtype) + value_states = value_states.to(v_out.dtype) if cache_position is None: k_out.copy_(key_states) @@ -1243,6 +1232,14 @@ def reset(self): self.key_cache[layer_idx].zero_() self.value_cache[layer_idx].zero_() + @property + def batch_size(self): + logger.warning_once( + f"The 'batch_size' attribute of {self.__class__.__name__} is deprecated and will be removed in " + "v4.49. 
Use the more precisely named 'self.max_batch_size' attribute instead." + ) + return self.max_batch_size + class SlidingWindowCache(StaticCache): """ @@ -1514,7 +1511,10 @@ def crop(self, maximum_length: int): self.check_dynamic_cache(self.crop.__name__) self.self_attention_cache.crop(maximum_length) - def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]": + @deprecate_kwarg("num_hidden_layers", version="4.47.0") + def batch_split( + self, full_batch_size: int, split_size: int, num_hidden_layers: int = None + ) -> "List[EncoderDecoderCache]": """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by `_split_model_inputs()` in `generation.utils`""" self.check_dynamic_cache(self.batch_split.__name__) @@ -1527,7 +1527,10 @@ def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDec return out @classmethod - def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache": + @deprecate_kwarg("num_hidden_layers", version="4.47.0") + def from_batch_splits( + cls, splits: List["EncoderDecoderCache"], num_hidden_layers: int = None + ) -> "EncoderDecoderCache": """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in `generation.utils`""" self_attention_cache = DynamicCache() @@ -1609,10 +1612,10 @@ def __init__( layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, ) -> None: super().__init__() - if max_batch_size is not None: + if batch_size is not None: logger.warning_once( - f"The 'max_batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in " - "v4.46. Use the more precisely named 'batch_size' argument instead." + f"The 'batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in " + "v4.49. Use the more precisely named 'max_batch_size' argument instead." ) if not hasattr(config, "sliding_window") or config.sliding_window is None: raise ValueError( @@ -1621,7 +1624,7 @@ def __init__( "config and it's not set to None." ) self.max_cache_len = max_cache_len - self.batch_size = batch_size or max_batch_size + self.max_batch_size = batch_size or max_batch_size # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads self.head_dim = ( config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads @@ -1741,6 +1744,14 @@ def reset(self): self.key_cache[layer_idx].zero_() self.value_cache[layer_idx].zero_() + @property + def batch_size(self): + logger.warning_once( + f"The 'batch_size' attribute of {self.__class__.__name__} is deprecated and will be removed in " + "v4.49. Use the more precisely named 'self.max_batch_size' attribute instead." + ) + return self.max_batch_size + class MambaCache: """ @@ -1798,20 +1809,20 @@ def __init__( device: Optional[Union[torch.device, str]] = None, max_batch_size: Optional[int] = None, ): - if max_batch_size is not None: + if batch_size is not None: logger.warning_once( - f"The 'max_batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in " - "v4.46. Use the more precisely named 'batch_size' argument instead." + f"The 'batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in " + "v4.49. Use the more precisely named 'max_batch_size' argument instead." 
) self.dtype = dtype - self.batch_size = batch_size or max_batch_size + self.max_batch_size = batch_size or max_batch_size self.intermediate_size = config.intermediate_size self.ssm_state_size = config.state_size self.conv_kernel_size = config.conv_kernel self.conv_states: torch.Tensor = torch.zeros( config.num_hidden_layers, - self.batch_size, + self.max_batch_size, self.intermediate_size, self.conv_kernel_size, device=device, @@ -1819,7 +1830,7 @@ def __init__( ) self.ssm_states: torch.Tensor = torch.zeros( config.num_hidden_layers, - self.batch_size, + self.max_batch_size, self.intermediate_size, self.ssm_state_size, device=device, @@ -1849,6 +1860,14 @@ def reset(self): self.conv_states.zero_() self.ssm_states.zero_() + @property + def batch_size(self): + logger.warning_once( + f"The 'batch_size' attribute of {self.__class__.__name__} is deprecated and will be removed in " + "v4.49. Use the more precisely named 'self.max_batch_size' attribute instead." + ) + return self.max_batch_size + class OffloadedStaticCache(StaticCache): """ @@ -1870,6 +1889,9 @@ class OffloadedStaticCache(StaticCache): The default `dtype` to use when initializing the cache. offload_device (`Union[str, torch.device]`, *optional*, defaults to `cpu`): The device to offload to. Defaults to CPU. + layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*): + Mapping between the layers and its device. This is required when you are manually initializing the cache and the model is splitted between differents gpus. + You can know which layers mapped to which device by checking the associated device_map: `model.hf_device_map`. Attributes: key_cache (`List[torch.Tensor]`): @@ -1916,10 +1938,11 @@ def __init__( device: Union[str, torch.device], dtype: Optional[torch.dtype] = None, offload_device: Union[str, torch.device] = torch.device("cpu"), + layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, ) -> None: self.max_batch_size = max_batch_size self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len - self.device = torch.device(device) + self.device = torch.device(device) if layer_device_map is None else layer_device_map[0] self.offload_device = torch.device(offload_device) self.dtype = dtype if dtype is not None else torch.float32 @@ -1927,7 +1950,9 @@ def __init__( head_dim = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads num_key_value_heads = ( - config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads + config.num_attention_heads + if getattr(config, "num_key_value_heads", None) is None + else config.num_key_value_heads ) cache_shape = (max_batch_size, num_key_value_heads, self.max_cache_len, head_dim) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 60f9f34cf861c9..e49eab86b4e12f 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -71,6 +71,8 @@ class PretrainedConfig(PushToHubMixin): outputs of the model during inference. - **attribute_map** (`Dict[str, str]`) -- A dict that maps model specific attribute names to the standardized naming of attributes. + - **base_model_tp_plan** (`Dict[str, Any]`) -- A dict that maps sub-modules FQNs of a base model to a tensor + parallel plan applied to the sub-module when `model.tensor_parallel` is called. 
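The `base_model_tp_plan` attribute described above maps fully-qualified sub-module name patterns of the base model to a tensor-parallel partitioning style that is applied when `model.tensor_parallel` is called. A minimal illustrative plan is sketched below; the wildcard key patterns and the `"colwise"`/`"rowwise"` style names are assumptions modeled on Llama-style decoder configurations, not values taken from this diff.

```python
# Illustrative only: a Llama-style tensor-parallel plan in the shape described above.
# Keys are sub-module FQN patterns of the base model; values name how each projection
# is assumed to be sharded across the tensor-parallel group ("colwise"/"rowwise").
base_model_tp_plan = {
    "layers.*.self_attn.q_proj": "colwise",
    "layers.*.self_attn.k_proj": "colwise",
    "layers.*.self_attn.v_proj": "colwise",
    "layers.*.self_attn.o_proj": "rowwise",
    "layers.*.mlp.gate_proj": "colwise",
    "layers.*.mlp.up_proj": "colwise",
    "layers.*.mlp.down_proj": "rowwise",
}
```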
Common attributes (present in all subclasses): @@ -194,6 +196,7 @@ class PretrainedConfig(PushToHubMixin): sub_configs: Dict[str, "PretrainedConfig"] = {} is_composition: bool = False attribute_map: Dict[str, str] = {} + base_model_tp_plan: Optional[Dict[str, Any]] = None _auto_class: Optional[str] = None def __setattr__(self, key, value): @@ -848,6 +851,9 @@ def to_diff_dict(self) -> Dict[str, Any]: if "_attn_implementation_internal" in serializable_config_dict: del serializable_config_dict["_attn_implementation_internal"] + # Do not serialize `base_model_tp_plan` for now + if "base_model_tp_plan" in serializable_config_dict: + del serializable_config_dict["base_model_tp_plan"] return serializable_config_dict @@ -867,6 +873,9 @@ def to_dict(self) -> Dict[str, Any]: del output["_commit_hash"] if "_attn_implementation_internal" in output: del output["_attn_implementation_internal"] + # Do not serialize `base_model_tp_plan` for now + if "base_model_tp_plan" in output: + del output["base_model_tp_plan"] # Transformers version when serializing the model output["transformers_version"] = __version__ diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 9543b58ad40d91..85345cc8e5889d 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -84,8 +84,8 @@ "tf2onnx": "tf2onnx", "timeout-decorator": "timeout-decorator", "tiktoken": "tiktoken", - "timm": "timm<=0.9.16", - "tokenizers": "tokenizers>=0.20,<0.21", + "timm": "timm<=1.0.11", + "tokenizers": "tokenizers>=0.21,<0.22", "torch": "torch", "torchaudio": "torchaudio", "torchvision": "torchvision", diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index f3cde8180c1bd4..6e8007edbc0b78 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -213,6 +213,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": Will be passed to the `to(...)` function of the tensors. kwargs (`Dict`, *optional*): Will be passed to the `to(...)` function of the tensors. + To enable asynchronous data transfer, set the `non_blocking` flag in `kwargs` (defaults to `False`). Returns: [`BatchFeature`]: The same instance after modification. 
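The `non_blocking` flag documented above is forwarded to `torch.Tensor.to`, so the usual PyTorch caveat applies: the host-to-device copy only overlaps with compute when the source tensors live in pinned memory. A minimal sketch, assuming a CUDA device is available (the dummy tensor below is just a placeholder):

```python
import torch
from transformers import BatchFeature

# Pinning the host tensor lets the subsequent .to() run asynchronously.
features = BatchFeature({"input_values": torch.randn(1, 16000).pin_memory()})
features = features.to("cuda", non_blocking=True)  # asynchronous host-to-device transfer
```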
@@ -222,6 +223,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": new_data = {} device = kwargs.get("device") + non_blocking = kwargs.get("non_blocking", False) # Check if the args are a device or a dtype if device is None and len(args) > 0: # device should be always the first argument @@ -241,7 +243,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": # cast and send to device new_data[k] = v.to(*args, **kwargs) elif isinstance(v, torch.Tensor) and device is not None: - new_data[k] = v.to(device=device) + new_data[k] = v.to(device=device, non_blocking=non_blocking) else: new_data[k] = v self.data = new_data diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py index b487fa3c7fe6ec..59d970db15416f 100644 --- a/src/transformers/generation/__init__.py +++ b/src/transformers/generation/__init__.py @@ -20,6 +20,7 @@ _import_structure = { "configuration_utils": [ "BaseWatermarkingConfig", + "CompileConfig", "GenerationConfig", "GenerationMode", "SynthIDTextWatermarkingConfig", @@ -49,6 +50,7 @@ _import_structure["candidate_generator"] = [ "AssistedCandidateGenerator", "CandidateGenerator", + "EarlyExitCandidateGenerator", "PromptLookupCandidateGenerator", ] _import_structure["logits_process"] = [ @@ -191,6 +193,7 @@ if TYPE_CHECKING: from .configuration_utils import ( BaseWatermarkingConfig, + CompileConfig, GenerationConfig, GenerationMode, SynthIDTextWatermarkingConfig, @@ -206,7 +209,12 @@ else: from .beam_constraints import Constraint, ConstraintListState, DisjunctiveConstraint, PhrasalConstraint from .beam_search import BeamHypotheses, BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer - from .candidate_generator import AssistedCandidateGenerator, CandidateGenerator, PromptLookupCandidateGenerator + from .candidate_generator import ( + AssistedCandidateGenerator, + CandidateGenerator, + EarlyExitCandidateGenerator, + PromptLookupCandidateGenerator, + ) from .logits_process import ( AlternatingCodebooksLogitsProcessor, ClassifierFreeGuidanceLogitsProcessor, diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 1e4d7a4702453a..9a62b5709b5f43 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -19,6 +19,12 @@ import numpy as np import torch +from ..utils import is_sklearn_available + + +if is_sklearn_available(): + from sklearn.metrics import roc_curve + from ..cache_utils import DynamicCache from ..pytorch_utils import isin_mps_friendly from .logits_process import LogitsProcessorList, MinLengthLogitsProcessor @@ -180,6 +186,14 @@ def __init__( # We need to roll back the cache in assisted generation, only DynamicCache is supported self.generation_config.cache_implementation = None + if ( + is_sklearn_available() + and self.assistant_model.generation_config.assistant_confidence_threshold + and type(self) is AssistedCandidateGenerator + ): + self.probs = [] + self.matches = [] + def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ Fetches the candidates to be tried for the current input. @@ -230,6 +244,17 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, # 3. 
Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values + if ( + is_sklearn_available() + and self.assistant_model.generation_config.assistant_confidence_threshold + and type(self) is AssistedCandidateGenerator + ): + scores_tensor = torch.cat(assistant_output.scores, dim=0) + scores_softmax = torch.softmax(scores_tensor, dim=-1) + ids = assistant_output.sequences[-1, -len(assistant_output.scores) :] + p = scores_softmax[range(len(ids)), ids] + self.probs.extend(p.tolist()) + # 4. Prepare variables for output candidate_logits = torch.stack(assistant_output.scores, dim=1) candidate_ids = assistant_output.sequences @@ -255,11 +280,44 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F "heuristic", "heuristic_transient", }: - if num_matches == int(self.num_assistant_tokens): + # len(scores[0])-1 is the number of candidates according to the target tokenizer. + if num_matches == len(scores[0]) - 1: self.num_assistant_tokens += 2.0 else: self.num_assistant_tokens = max(1.0, self.num_assistant_tokens - 1.0) + # The assistant's confidence threshold is adjusted throughout the speculative iterations to reduce the number of unnecessary draft and target forward passes. The costs are estimated based on the ROC curve, which considers the probability of the draft token and its match with the target. A cost of 25% is assigned to false positives and 75% to false negatives. + # This adaptation is not compatible with UAG, as it relies on the number of matched tokens based on the draft vocabulary, which is unavailable in UAG. + if ( + is_sklearn_available() + and self.assistant_model.generation_config.assistant_confidence_threshold + and type(self) is AssistedCandidateGenerator + ): + # update self.matches + self.matches.extend([1] * num_matches) + if len(self.probs) > len(self.matches): + self.matches.append(0) + + # update self.probs + excess_length = len(self.probs) - len(self.matches) + if excess_length > 0: + del self.probs[-excess_length:] + + if ( + len(self.probs) > 5 and {0, 1}.issubset(self.matches) + ): # require at least 5 samples to calculate the ROC curve and at least one positive and one negative sample + fpr, tpr, thresholds = roc_curve(self.matches, self.probs) + fnr = 1 - tpr + + # Calculate the cost for each threshold + costs = fpr + 3 * fnr + + # Find the threshold that minimizes the cost + optimal_threshold_index = np.argmin(costs) + best_threshold = thresholds[optimal_threshold_index] + + self.assistant_model.generation_config.assistant_confidence_threshold = best_threshold + class AssistedCandidateGeneratorDifferentTokenizers(AssistedCandidateGenerator): """ @@ -309,10 +367,9 @@ def __init__( self.target_tokenizer = target_tokenizer self.assistant_tokenizer = assistant_tokenizer - self.prev_tokens = None self.prev_assistant_ids = None - self.target_lookbehind = 10 - self.assistant_lookbehind = 10 + self.target_lookbehind = assistant_model.generation_config.target_lookbehind + self.assistant_lookbehind = assistant_model.generation_config.assistant_lookbehind @staticmethod def _get_longest_diag_dict(input_matrix, nonzero_idx): @@ -449,9 +506,9 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, # Since re-encoding the tokens may result in tokenization discrepancies, we use 2 look behind values # (one for each conversion) which mark where to start looking for the overlap between the # source and target encodings, to ensure the new tokens include 
the correct prompt suffix. - if self.prev_tokens is not None and self.prev_target_ids.shape[1] > self.target_lookbehind: + if self.prev_assistant_ids is not None and input_ids.shape[1] > self.target_lookbehind: # input_ids contains all target prompt input ids and some new target input ids - start_index_in_target_window = self.prev_target_ids.shape[1] - self.target_lookbehind + start_index_in_target_window = input_ids.shape[1] - self.target_lookbehind new_assistant_ids = self.convert_source_tokens_to_target_tokens( input_ids[:, start_index_in_target_window:], **convert_kwargs @@ -484,7 +541,6 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, else: assistant_input_ids = self.convert_source_tokens_to_target_tokens(input_ids, **convert_kwargs) - self.prev_target_ids = input_ids self.prev_assistant_ids = assistant_input_ids new_cur_len = assistant_input_ids.shape[-1] @@ -519,6 +575,8 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, num_prev_assistant = self.prev_assistant_ids.shape[1] start_assistant_look_index = num_prev_assistant - self.assistant_lookbehind + if start_assistant_look_index < 0: + start_assistant_look_index = 0 new_target_ids_from_window = self.convert_source_tokens_to_target_tokens( assistant_output.sequences[:, start_assistant_look_index:], @@ -542,14 +600,11 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, # edge case: in case of no intersection between prompt and new_target_ids new_target_ids = torch.cat([new_target_ids, new_target_ids_from_window], dim=-1) - self.prev_target_ids = input_ids - if hasattr(self.generation_config, "max_length"): new_target_ids = new_target_ids[:, : self.generation_config.max_length] # 3. Update variables for the next round of candidate generation self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values - self.prev_tokens = assistant_output.sequences # 4. Prepare variables for output if input_ids.shape[1] >= new_target_ids.shape[1]: @@ -670,6 +725,62 @@ def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.F return +class EarlyExitCandidateGenerator(AssistedCandidateGenerator): + """ + `CandidateGenerator` class to be used for assisted generation and speculative decoding. This class generates + candidates through the use of **the model itself**, exiting early. Can only be used with models that support early + exit, e.g., `facebook/layerskip-llama3.2-1B`. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids) + assistant_model (`PreTrainedModel`): + The original model. This model must support early exit (i.e. is trained to compute logits in earlier + layers). + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. + logits_processor (`LogitsProcessorList`): + An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] + used to modify the prediction scores of the language modeling head applied at each generation step. + model_kwargs (`Dict`): + The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant + model as well. + inputs_tensor (`torch.Tensor`, *optional*): + The model input tensor. In encoder-decoder models, this is the encoder input. 
+ """ + + def __init__( + self, + input_ids: torch.LongTensor, + assistant_model: "PreTrainedModel", + generation_config: "GenerationConfig", + model_kwargs: Dict, + inputs_tensor: Optional[torch.Tensor] = None, + logits_processor: "LogitsProcessorList" = None, + ): + super().__init__( + input_ids=input_ids, + assistant_model=assistant_model, + generation_config=generation_config, + model_kwargs=model_kwargs, + inputs_tensor=inputs_tensor, + logits_processor=logits_processor, + ) + # We have to move early exit out of the generation config, otherwise the assistant will also call `generate` + # with early exit + self.assistant_early_exit = self.generation_config.assistant_early_exit + self.generation_config.assistant_early_exit = None + + def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: + # Temporarily sets the number of hidden layers to the early exit value + base_model = getattr(self.assistant_model, self.assistant_model.base_model_prefix) + original_num_hidden_layers = base_model.config.num_hidden_layers + base_model.config.num_hidden_layers = self.assistant_early_exit + candidate_ids, candidate_logits = super().get_candidates(input_ids) + base_model.config.num_hidden_layers = original_num_hidden_layers + return candidate_ids, candidate_logits + + def _crop_past_key_values(model, past_key_values, max_length): """Crops the past key values up to a certain maximum length.""" new_past = [] diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 9b543f6c35711d..18cf26b8b73415 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -20,7 +20,7 @@ import warnings from abc import ABC, abstractmethod from dataclasses import dataclass, is_dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union from .. import __version__ from ..configuration_utils import PretrainedConfig @@ -72,7 +72,9 @@ "mamba": MambaCache, } QUANT_BACKEND_CLASSES_MAPPING = {"quanto": QuantoQuantizedCache, "HQQ": HQQQuantizedCache} - ALL_CACHE_IMPLEMENTATIONS = list(NEED_SETUP_CACHE_CLASSES_MAPPING.keys()) + list(NEEDS_CACHE_CONFIG.keys()) + ALL_CACHE_IMPLEMENTATIONS = ( + list(NEED_SETUP_CACHE_CLASSES_MAPPING.keys()) + list(NEEDS_CACHE_CONFIG.keys()) + ["offloaded"] + ) class GenerationMode(ExplicitEnum): @@ -193,12 +195,12 @@ class GenerationConfig(PushToHubMixin): > Parameters for manipulation of the model output logits temperature (`float`, *optional*, defaults to 1.0): - The value used to modulate the next token probabilities. + The value used to module the next token probabilities. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 1.0 top_k (`int`, *optional*, defaults to 50): - The number of highest probability vocabulary tokens to keep for top-k-filtering. + The number of highest probability vocabulary tokens to keep for top-k-filtering. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 50. top_p (`float`, *optional*, defaults to 1.0): If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to - `top_p` or higher are kept for generation. + `top_p` or higher are kept for generation. This value is set in a model's `generation_config.json` file. 
If it isn't set, the default value is 1.0 min_p (`float`, *optional*): Minimum token probability, which will be scaled by the probability of the most likely token. It must be a value between 0 and 1. Typical values are in the 0.01-0.2 range, comparably selective as setting `top_p` in @@ -351,12 +353,31 @@ class GenerationConfig(PushToHubMixin): assistant_confidence_threshold (`float`, *optional*, defaults to 0.4): The confidence threshold for the assistant model. If the assistant model's confidence in its prediction for the current token is lower than this threshold, the assistant model stops the current token generation iteration, even if the number of _speculative tokens_ - (defined by `num_assistant_tokens`) is not yet reached. It is an unsupervised version of the dynamic speculation lookahead + (defined by `num_assistant_tokens`) is not yet reached. The assistant's confidence threshold is adjusted throughout the speculative iterations to reduce the number of unnecessary draft and target forward passes, biased towards avoiding false negatives. + `assistant_confidence_threshold` value is persistent over multiple generation calls with the same assistant model. + It is an unsupervised version of the dynamic speculation lookahead from Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large Language Models . - prompt_lookup_num_tokens (`int`, *optional*, default to `None`): + prompt_lookup_num_tokens (`int`, *optional*): The number of tokens to be output as candidate tokens. - max_matching_ngram_size (`int`, *optional*, default to `None`): + max_matching_ngram_size (`int`, *optional*): The maximum ngram size to be considered for matching in the prompt. Default to 2 if not provided. + assistant_early_exit(`int`, *optional*): + If set to a positive integer, early exit of the model will be used as an assistant. Can only be used with + models that support early exit (i.e. models where logits from intermediate layers can be interpreted by the LM head). + assistant_lookbehind(`int`, *optional*, defaults to 10): + If set to a positive integer, the re-encodeing process will additionally consider the last `assistant_lookbehind` assistant tokens + to correctly align tokens. Can only be used with different tokenizers in speculative decoding. + See this [blog](https://huggingface.co/blog/universal_assisted_generation) for more details. + target_lookbehind(`int`, *optional*, defaults to 10): + If set to a positive integer, the re-encodeing process will additionally consider the last `target_lookbehind` target tokens + to correctly align tokens. Can only be used with different tokenizers in speculative decoding. + See this [blog](https://huggingface.co/blog/universal_assisted_generation) for more details. + + > Parameters related to performances and compilation + + compile_config (CompileConfig, *optional*): + If using a static cache, this controls how `generate` will `compile` the forward pass for performance + gains. 
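The new `assistant_early_exit` option turns assisted generation into self-speculative decoding: the model drafts tokens from an intermediate layer and verifies them with its full depth, so no separate draft model is needed. A minimal sketch, assuming a CUDA device and the LayerSkip checkpoint named in the candidate-generator docstring (the exit layer `4` and `max_new_tokens` below are illustrative choices, not values from this diff):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "facebook/layerskip-llama3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to("cuda")

inputs = tokenizer("Alice and Bob", return_tensors="pt").to("cuda")
# Draft tokens with the first 4 decoder layers, then verify them with the full model.
outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```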
> Wild card @@ -454,10 +475,15 @@ def __init__(self, **kwargs): self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 20) self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "constant") self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", 0.4) - - # Prompt lookup decoding self.prompt_lookup_num_tokens = kwargs.pop("prompt_lookup_num_tokens", None) self.max_matching_ngram_size = kwargs.pop("max_matching_ngram_size", None) + self.assistant_early_exit = kwargs.pop("assistant_early_exit", None) + ## assistant generation for different tokenizers, the windows size for assistant/target model + self.assistant_lookbehind = kwargs.pop("assistant_lookbehind", 10) + self.target_lookbehind = kwargs.pop("target_lookbehind", 10) + + # Performances + self.compile_config = kwargs.pop("compile_config", CompileConfig()) # Wild card self.generation_kwargs = kwargs.pop("generation_kwargs", {}) @@ -534,7 +560,11 @@ def get_generation_mode(self, assistant_model: Optional["PreTrainedModel"] = Non generation_mode = GenerationMode.BEAM_SEARCH # Assisted generation may extend some generation modes - if assistant_model is not None or self.prompt_lookup_num_tokens is not None: + if ( + assistant_model is not None + or self.prompt_lookup_num_tokens is not None + or self.assistant_early_exit is not None + ): if generation_mode in ("greedy_search", "sample"): generation_mode = GenerationMode.ASSISTED_GENERATION else: @@ -775,7 +805,13 @@ def validate(self, is_init=False): self.watermarking_config = WatermarkingConfig.from_dict(self.watermarking_config) self.watermarking_config.validate() - # 7. other incorrect combinations + # 7. performances arguments + if not isinstance(self.compile_config, CompileConfig): + raise ValueError( + f"You provided `compile_config` as an instance of {type(self.compile_config)}, but it must be an instance of `CompileConfig`." + ) + + # 8. other incorrect combinations if self.return_dict_in_generate is not True: for extra_output_flag in self.extra_output_flags: if getattr(self, extra_output_flag) is True: @@ -1156,6 +1192,8 @@ def to_dict(self) -> Dict[str, Any]: del output["_commit_hash"] if "_original_object_hash" in output: del output["_original_object_hash"] + if "compile_config" in output: + del output["compile_config"] # Transformers version when serializing this file output["transformers_version"] = __version__ @@ -1540,3 +1578,51 @@ def construct_processor(self, vocab_size: int, device) -> "WatermarkLogitsProces skip_first_ngram_calls=self.skip_first_ngram_calls, debug_mode=self.debug_mode, ) + + +@dataclass +class CompileConfig(object): + """ + Class that holds arguments relative to `torch.compile` behavior, when using automatic compilation in `generate`. + See [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) for more details on the arguments. + + Args: + fullgraph (`bool`, *optional*, defaults to `True`): + If `True`, requires that the whole forward be capturable in a single graph. + dynamic (`bool` or `None`, *optional*): + Whether to try to use dynamic shape graphs. + backend (`str` or `Callable`, *optional*, defaults to `"inductor"`): + Backend to be used. + mode (`str`, *optional*, defaults to `"reduce-overhead"`): + Controls balance between performance and overhead. + options (`dict`, *optional*): + A dictionary of options to pass to the backend. 
+ + Examples: + ```python + >>> from transformers import AutoModelForCausalLM, AutoTokenizer, CompileConfig + + >>> tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-2b') + >>> model = AutoModelForCausalLM.from_pretrained('google/gemma-2-2b').cuda() + + >>> # Automatic compile configuration, used with static cache + >>> compile_config = CompileConfig(dynamic=True) + + >>> # Generation with static cache and compile config + >>> input = tokenizer.encode("Hello there, how", return_tensors="pt").cuda() + >>> output = model.generate( + ... input, do_sample=False, max_new_tokens=300, cache_implementation="static", compile_config=compile_config + ... ) + >>> output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0] + ``` + """ + + fullgraph: bool = True + dynamic: Optional[bool] = None + backend: Union[str, Callable] = "inductor" + mode: str = "reduce-overhead" + options: Optional[dict] = None + + def to_dict(self) -> Dict[str, Any]: + """Serializes this instance to a Python dictionary.""" + return copy.deepcopy(self.__dict__) diff --git a/src/transformers/generation/flax_logits_process.py b/src/transformers/generation/flax_logits_process.py index 9b2ab5fb1afa47..d106c32defa4db 100644 --- a/src/transformers/generation/flax_logits_process.py +++ b/src/transformers/generation/flax_logits_process.py @@ -273,7 +273,7 @@ class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor): r""" [`FlaxLogitsProcessor`] supressing a list of tokens as soon as the `generate` function starts generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not sampled at the - begining of the generation. + beginning of the generation. Args: begin_suppress_tokens (`List[int]`): diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py index 88535b44e9c479..8e87ead7fdd5a9 100644 --- a/src/transformers/generation/flax_utils.py +++ b/src/transformers/generation/flax_utils.py @@ -398,7 +398,11 @@ def generate( ) generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length else: # by default let's always generate 10 new tokens - generation_config.max_length = generation_config.max_length + input_ids_seq_length + if generation_config.max_length == GenerationConfig().max_length: + generation_config.max_length = generation_config.max_length + input_ids_seq_length + max_position_embeddings = getattr(self.config, "max_position_embeddings", None) + if max_position_embeddings is not None: + generation_config.max_length = min(generation_config.max_length, max_position_embeddings) if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length: raise ValueError( diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 9d244191da811c..39a38f9139ec1b 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1782,7 +1782,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor): r""" [`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are - not generated at the begining. Originally created for + not generated at the beginning. Originally created for [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper). 
Examples: diff --git a/src/transformers/generation/tf_logits_process.py b/src/transformers/generation/tf_logits_process.py index 91e20fe02f7f4f..f70655fb7c1d0b 100644 --- a/src/transformers/generation/tf_logits_process.py +++ b/src/transformers/generation/tf_logits_process.py @@ -512,7 +512,7 @@ class TFSuppressTokensAtBeginLogitsProcessor(TFLogitsProcessor): r""" [`TFSuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` at not - sampled at the begining of the generation. + sampled at the beginning of the generation. """ def __init__(self, begin_suppress_tokens, begin_index): diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 6e6d5b8bdce71d..89c57cb913fec2 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -15,6 +15,7 @@ # limitations under the License. import copy import inspect +import os import warnings from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union @@ -44,7 +45,6 @@ is_accelerate_available, is_hqq_available, is_optimum_quanto_available, - is_quanto_available, is_torchdynamo_compiling, logging, ) @@ -54,6 +54,7 @@ AssistedCandidateGenerator, AssistedCandidateGeneratorDifferentTokenizers, CandidateGenerator, + EarlyExitCandidateGenerator, PromptLookupCandidateGenerator, _crop_past_key_values, _prepare_attention_mask, @@ -419,7 +420,12 @@ def prepare_inputs_for_generation( model_input = kwargs.get(model_input_name) if model_input is not None: if past_key_values is not None: - model_input = model_input[:, -input_ids.shape[1] :] + current_input_length = ( + model_inputs["inputs_embeds"].shape[1] + if model_inputs["inputs_embeds"] is not None + else model_inputs[input_ids_key].shape[1] + ) + model_input = model_input[:, -current_input_length:] model_input = model_input.clone(memory_format=torch.contiguous_format) model_inputs[model_input_name] = model_input @@ -822,7 +828,16 @@ def _get_candidate_generator( """ different_tokenizers = all(v is not None for v in (assistant_model, target_tokenizer, assistant_tokenizer)) - if generation_config.prompt_lookup_num_tokens is not None: + if generation_config.assistant_early_exit is not None: + candidate_generator = EarlyExitCandidateGenerator( + input_ids=input_ids, + assistant_model=self, + generation_config=generation_config, + model_kwargs=model_kwargs, + inputs_tensor=inputs_tensor, + logits_processor=logits_processor, + ) + elif generation_config.prompt_lookup_num_tokens is not None: candidate_generator = PromptLookupCandidateGenerator( eos_token_id=generation_config._eos_token_tensor, num_output_tokens=generation_config.prompt_lookup_num_tokens, @@ -1019,10 +1034,6 @@ def _get_logits_processor( "You have explicitly specified `forced_decoder_ids`. 
Please remove the `forced_decoder_ids` argument " "in favour of `input_ids` or `decoder_input_ids` respectively.", ) - if generation_config.watermarking_config is not None: - processors.append( - generation_config.watermarking_config.construct_processor(self.config.vocab_size, device) - ) # TODO (joao): find a strategy to specify the order of the processors processors = self._merge_criteria_processor_list(processors, logits_processor) @@ -1075,6 +1086,12 @@ def _get_logits_processor( ) ) + # Watermarking should be after all logits processing is finished (see #34630) + if generation_config.watermarking_config is not None: + processors.append( + generation_config.watermarking_config.construct_processor(self.config.vocab_size, device) + ) + # `LogitNormalization` should always be the last logit processor, when present if generation_config.renormalize_logits is True: processors.append(LogitNormalization()) @@ -1448,14 +1465,16 @@ def _prepare_generated_length( elif ( model_input_name == "inputs_embeds" and input_ids_length != inputs_tensor.shape[1] + and input_ids_length != 0 and not self.config.is_encoder_decoder ): generation_config.max_length -= inputs_tensor.shape[1] elif has_default_max_length: # by default let's always generate 20 new tokens - generation_config.max_length = generation_config.max_length + input_ids_length - max_position_embeddings = getattr(self.config, "max_position_embeddings", None) - if max_position_embeddings is not None: - generation_config.max_length = min(generation_config.max_length, max_position_embeddings) + if generation_config.max_length == GenerationConfig().max_length: + generation_config.max_length = generation_config.max_length + input_ids_length + max_position_embeddings = getattr(self.config, "max_position_embeddings", None) + if max_position_embeddings is not None: + generation_config.max_length = min(generation_config.max_length, max_position_embeddings) # same for min length if generation_config.min_new_tokens is not None: @@ -1591,7 +1610,7 @@ def _get_cache( need_new_cache = ( not hasattr(self, "_cache") or (not isinstance(cache_to_check, cache_cls)) - or cache_to_check.batch_size != batch_size + or cache_to_check.max_batch_size != batch_size ) if cache_implementation != "mamba": need_new_cache = need_new_cache or cache_to_check.max_cache_len < max_cache_len @@ -1635,7 +1654,10 @@ def get_layer_device_map(execution_device_map: Optional[dict] = None): # This is needed here if we don't want to make changes in accelerate in order to save execution_device # For offloaded case, we need to get the execution device, not just the device where it is offloaded if hasattr(self, "hf_device_map"): - main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0] + if set(self.hf_device_map.values()) == {"cpu"} or set(self.hf_device_map.values()) == {"cpu", "disk"}: + main_device = "cpu" + else: + main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0] execution_device_map = { name: main_device if device in ["cpu", "disk"] else device for name, device in self.hf_device_map.items() @@ -1644,7 +1666,7 @@ def get_layer_device_map(execution_device_map: Optional[dict] = None): cache_kwargs = { "config": self.config.get_text_config(), - "batch_size": batch_size, + "max_batch_size": batch_size, "max_cache_len": max_cache_len, "device": device, "dtype": cache_dtype, @@ -1765,7 +1787,7 @@ def _prepare_cache_for_generation( ) cache_class = QUANT_BACKEND_CLASSES_MAPPING[cache_config.backend] - if cache_config.backend == 
"quanto" and not (is_optimum_quanto_available() or is_quanto_available()): + if cache_config.backend == "quanto" and not is_optimum_quanto_available(): raise ImportError( "You need to install optimum-quanto in order to use KV cache quantization with optimum-quanto backend. " "Please install it via with `pip install optimum-quanto`" @@ -1844,8 +1866,8 @@ def _tensor_or_none(token, device=None): "The attention mask and the pad token id were not set. As a consequence, you may observe " "unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results." ) - logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.") pad_token_tensor = eos_token_tensor[0] + logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.") # Sanity checks/warnings if self.config.is_encoder_decoder and decoder_start_token_tensor is None: @@ -3208,6 +3230,14 @@ def _sample( unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) + model_forward = self.__call__ + if isinstance(model_kwargs.get("past_key_values"), StaticCache): + if self.device.type == "cuda": + logger.warning_once("Using `torch.compile`.") + os.environ["TOKENIZERS_PARALLELISM"] = "0" + model_forward = self.get_compiled_call(generation_config.compile_config) + + is_prefill = True while self._has_unfinished_sequences( this_peer_finished, synced_gpus, device=input_ids.device, cur_len=cur_len, max_length=max_length ): @@ -3218,8 +3248,11 @@ def _sample( model_inputs.update({"output_attentions": output_attentions} if output_attentions else {}) model_inputs.update({"output_hidden_states": output_hidden_states} if output_hidden_states else {}) - # forward pass to get next token - outputs = self(**model_inputs, return_dict=True) + if is_prefill: + outputs = self(**model_inputs, return_dict=True) + is_prefill = False + else: + outputs = model_forward(**model_inputs, return_dict=True) # synced_gpus: don't waste resources running the code we don't need; kwargs must be updated before skipping model_kwargs = self._update_model_kwargs_for_generation( @@ -3232,7 +3265,7 @@ def _sample( # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration # (the clone itself is always small) - next_token_logits = outputs.logits.clone()[:, -1, :].float() + next_token_logits = outputs.logits[:, -1, :].clone().float() next_token_logits = next_token_logits.to(input_ids.device) # pre-process distribution diff --git a/src/transformers/image_processing_base.py b/src/transformers/image_processing_base.py index bc7fd228edcc8c..e73d4a8a56f311 100644 --- a/src/transformers/image_processing_base.py +++ b/src/transformers/image_processing_base.py @@ -19,7 +19,7 @@ import os import warnings from io import BytesIO -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Type, TypeVar, Union import numpy as np import requests @@ -45,6 +45,9 @@ from PIL import Image +ImageProcessorType = TypeVar("ImageProcessorType", bound="ImageProcessingMixin") + + logger = logging.get_logger(__name__) @@ -95,7 +98,7 @@ def _set_processor_class(self, processor_class: str): @classmethod def from_pretrained( - cls, + cls: Type[ImageProcessorType], pretrained_model_name_or_path: Union[str, os.PathLike], cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, @@ 
-103,7 +106,7 @@ def from_pretrained( token: Optional[Union[str, bool]] = None, revision: str = "main", **kwargs, - ): + ) -> ImageProcessorType: r""" Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor. diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index f59b99b490d38d..51199d9f3698fc 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -24,6 +24,7 @@ from .utils import ( ExplicitEnum, + TensorType, is_jax_tensor, is_numpy_array, is_tf_tensor, @@ -447,6 +448,44 @@ def validate_preprocess_arguments( raise ValueError("`size` and `resample` must be specified if `do_resize` is `True`.") +def validate_fast_preprocess_arguments( + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + size_divisibility: Optional[int] = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[Dict[str, int]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: Optional["PILImageResampling"] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, +): + """ + Checks validity of typically used arguments in an `ImageProcessorFast` `preprocess` method. + Raises `ValueError` if arguments incompatibility is caught. + """ + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + # Extra checks for ImageProcessorFast + if return_tensors != "pt": + raise ValueError("Only returning PyTorch tensors is currently supported.") + + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + # In the future we can add a TF implementation here when we have TF models. 
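`validate_fast_preprocess_arguments` reuses the existing slow-path checks and then enforces the two constraints of the fast image processors: PyTorch output tensors and channels-first layout. A minimal sketch of a call that passes these checks (the sizes, normalization statistics, and resampling filter below are placeholders):

```python
from transformers.image_utils import ChannelDimension, PILImageResampling, validate_fast_preprocess_arguments

validate_fast_preprocess_arguments(
    do_rescale=True,
    rescale_factor=1 / 255,
    do_normalize=True,
    image_mean=[0.5, 0.5, 0.5],
    image_std=[0.5, 0.5, 0.5],
    do_resize=True,
    size={"height": 224, "width": 224},
    resample=PILImageResampling.BILINEAR,
    return_tensors="pt",                 # anything other than "pt" raises ValueError
    data_format=ChannelDimension.FIRST,  # only channels-first layout is accepted
)
```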
class ImageFeatureExtractionMixin: """ diff --git a/src/transformers/integrations/bitnet.py b/src/transformers/integrations/bitnet.py index 3386bdcb43b27c..0b50f9738afb69 100644 --- a/src/transformers/integrations/bitnet.py +++ b/src/transformers/integrations/bitnet.py @@ -127,6 +127,8 @@ class BitLinear(nn.Module): def __init__(self, in_features: int, out_features: int, bias: bool, device=None, dtype=None): super().__init__() self.dtype = dtype + self.in_features = in_features + self.out_features = out_features self.register_buffer( "weight", torch.zeros( diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index f4545f2698c017..57f0af5667e648 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -248,6 +248,20 @@ "output_norm": "backbone.norm_f", "output.weight": "lm_head.weight", }, + "nemotron": { + "token_embd": "model.embed_tokens", + "blk": "model.layers", + "ffn_up": "mlp.up_proj", + "ffn_down": "mlp.down_proj", + "ffn_norm": "post_attention_layernorm", + "attn_norm": "input_layernorm", + "attn_q": "self_attn.q_proj", + "attn_v": "self_attn.v_proj", + "attn_k": "self_attn.k_proj", + "attn_output": "self_attn.o_proj", + "output.weight": "lm_head.weight", + "output_norm": "model.norm", + }, } @@ -397,6 +411,18 @@ "ssm.time_step_rank": "time_step_rank", "ssm.inner_size": "intermediate_size", }, + "nemotron": { + "context_length": "max_position_embeddings", + "block_count": "num_hidden_layers", + "feed_forward_length": "intermediate_size", + "embedding_length": "hidden_size", + "rope.dimension_count": None, + "rope.freq_base": "rope_theta", + "attention.head_count": "num_attention_heads", + "attention.head_count_kv": "num_key_value_heads", + "attention.layer_norm_rms_epsilon": "norm_eps", + "vocab_size": "vocab_size", + }, } GGUF_TOKENIZER_MAPPING = { @@ -793,6 +819,7 @@ def converted(self) -> Tokenizer: "starcoder2": GGUFGPTConverter, "t5": GGUFT5Converter, "mamba": GGUFGPTConverter, + "nemotron": GGUFGPTConverter, } diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 4b236b9155f158..0cc2685a55206f 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -208,7 +208,7 @@ def hp_params(trial): if is_optuna_available(): import optuna - if isinstance(trial, optuna.Trial): + if isinstance(trial, optuna.trial.BaseTrial): return trial.params if is_ray_tune_available(): if isinstance(trial, dict): @@ -230,7 +230,7 @@ def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> Be if trainer.args.process_index == 0: - def _objective(trial, checkpoint_dir=None): + def _objective(trial: optuna.Trial, checkpoint_dir=None): checkpoint = None if checkpoint_dir: for subdir in os.listdir(checkpoint_dir): @@ -240,10 +240,11 @@ def _objective(trial, checkpoint_dir=None): if trainer.args.world_size > 1: if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED: raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.") - trainer._hp_search_setup(trial) - args_main_rank_list = [pickle.dumps(trainer.args)] - torch.distributed.broadcast_object_list(args_main_rank_list, src=0) - trainer.train(resume_from_checkpoint=checkpoint) + trainer.hp_space(trial) + fixed_trial = optuna.trial.FixedTrial(trial.params, trial.number) + trial_main_rank_list = [fixed_trial] + torch.distributed.broadcast_object_list(trial_main_rank_list, src=0) + 
trainer.train(resume_from_checkpoint=checkpoint, trial=trial) else: trainer.train(resume_from_checkpoint=checkpoint, trial=trial) # If there hasn't been any evaluation during the training loop. @@ -268,15 +269,11 @@ def _objective(trial, checkpoint_dir=None): else: for i in range(n_trials): trainer.objective = None - args_main_rank_list = [None] + trial_main_rank_list = [None] if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED: raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.") - torch.distributed.broadcast_object_list(args_main_rank_list, src=0) - args = pickle.loads(bytes(args_main_rank_list[0])) - for key, value in asdict(args).items(): - if key != "local_rank": - setattr(trainer.args, key, value) - trainer.train(resume_from_checkpoint=None) + torch.distributed.broadcast_object_list(trial_main_rank_list, src=0) + trainer.train(resume_from_checkpoint=None, trial=trial_main_rank_list[0]) # If there hasn't been any evaluation during the training loop. if getattr(trainer, "objective", None) is None: metrics = trainer.evaluate() diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index 8afff36eb08625..ef09281431169f 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -81,6 +81,7 @@ def load_adapter( peft_config: Dict[str, Any] = None, adapter_state_dict: Optional[Dict[str, "torch.Tensor"]] = None, low_cpu_mem_usage: bool = False, + is_trainable: bool = False, adapter_kwargs: Optional[Dict[str, Any]] = None, ) -> None: """ @@ -136,6 +137,9 @@ def load_adapter( low_cpu_mem_usage (`bool`, *optional*, defaults to `False`): Reduce memory usage while loading the PEFT adapter. This should also speed up the loading process. Requires PEFT version 0.13.0 or higher. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be + used for inference. adapter_kwargs (`Dict[str, Any]`, *optional*): Additional keyword arguments passed along to the `from_pretrained` method of the adapter config and `find_adapter_config_file` method. @@ -209,6 +213,7 @@ def load_adapter( token=token, **adapter_kwargs, ) + peft_config.inference_mode = not is_trainable # Create and add fresh new adapters into the model. inject_adapter_in_model(peft_config, self, adapter_name, **peft_load_kwargs) @@ -258,6 +263,9 @@ def load_adapter( if err_msg: logger.warning(err_msg) + if peft_config.inference_mode: + self.eval() + # Re-dispatch model and hooks in case the model is offloaded to CPU / Disk. if ( (getattr(self, "hf_device_map", None) is not None) @@ -381,7 +389,7 @@ def enable_adapters(self) -> None: If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT official documentation: https://huggingface.co/docs/peft - Enable adapters that are attached to the model. The model will use `self.active_adapter()` + Enable adapters that are attached to the model. 
""" check_peft_version(min_version=MIN_PEFT_VERSION) @@ -457,7 +465,7 @@ def get_adapter_state_dict(self, adapter_name: Optional[str] = None) -> dict: from peft import get_peft_model_state_dict if adapter_name is None: - adapter_name = self.active_adapter() + adapter_name = self.active_adapters()[0] adapter_state_dict = get_peft_model_state_dict(self, adapter_name=adapter_name) return adapter_state_dict diff --git a/src/transformers/integrations/quanto.py b/src/transformers/integrations/quanto.py index 27b32de63bfe55..1c5702321937da 100644 --- a/src/transformers/integrations/quanto.py +++ b/src/transformers/integrations/quanto.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..utils import is_optimum_quanto_available, is_quanto_available, is_torch_available, logging +from ..utils import is_optimum_quanto_available, is_torch_available, logging if is_torch_available(): @@ -50,11 +50,6 @@ def replace_with_quanto_layers( if is_optimum_quanto_available(): from optimum.quanto import QLayerNorm, QLinear, qfloat8, qint2, qint4, qint8 - elif is_quanto_available(): - logger.warning_once( - "Importing from quanto will be deprecated in v4.47. Please install optimum-quanto instead `pip install optimum-quanto`" - ) - from quanto import QLayerNorm, QLinear, qfloat8, qint2, qint4, qint8 w_mapping = {"float8": qfloat8, "int8": qint8, "int4": qint4, "int2": qint2} a_mapping = {None: None, "float8": qfloat8, "int8": qint8} diff --git a/src/transformers/integrations/tiktoken.py b/src/transformers/integrations/tiktoken.py new file mode 100644 index 00000000000000..60f733928406c2 --- /dev/null +++ b/src/transformers/integrations/tiktoken.py @@ -0,0 +1,45 @@ +from pathlib import Path +from typing import Any + +from transformers.convert_slow_tokenizer import TikTokenConverter +from transformers.tokenization_utils_fast import TIKTOKEN_VOCAB_FILE, TOKENIZER_FILE + + +def convert_tiktoken_to_fast(encoding: Any, output_dir: str): + """ + Converts given `tiktoken` encoding to `PretrainedTokenizerFast` and saves the configuration of converted tokenizer + on disk. + + Args: + encoding (`str` or `tiktoken.Encoding`): + Tokenizer from `tiktoken` library. If `encoding` is `str`, the tokenizer will be loaded with + `tiktoken.get_encoding(encoding)`. + output_dir (`str`): + Save path for converted tokenizer configuration file. + """ + output_dir = Path(output_dir) + output_dir.mkdir(exist_ok=True) + + save_file = output_dir / "tiktoken" / TIKTOKEN_VOCAB_FILE + tokenizer_file = output_dir / TOKENIZER_FILE + + save_file_absolute = str(save_file.absolute()) + output_file_absolute = str(tokenizer_file.absolute()) + + try: + from tiktoken import get_encoding + from tiktoken.load import dump_tiktoken_bpe + + if isinstance(encoding, str): + encoding = get_encoding(encoding) + + dump_tiktoken_bpe(encoding._mergeable_ranks, save_file_absolute) + except ImportError: + raise ValueError( + "`tiktoken` is required to save a `tiktoken` file. Install it with " "`pip install tiktoken`." 
+ ) + + tokenizer = TikTokenConverter( + vocab_file=save_file_absolute, pattern=encoding._pat_str, additional_special_tokens=encoding._special_tokens + ).tokenizer() + tokenizer.save(output_file_absolute) diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py index 045d2f6d646010..ec03ba1eb5fd83 100644 --- a/src/transformers/modeling_flash_attention_utils.py +++ b/src/transformers/modeling_flash_attention_utils.py @@ -20,7 +20,10 @@ import torch import torch.nn.functional as F -from .utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal +from .utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal, logging + + +logger = logging.get_logger(__name__) if is_flash_attn_2_available(): @@ -163,8 +166,8 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids): Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value). """ query = query.view(-1, query.size(-2), query.size(-1)) - key = key.view(-1, key.size(-2), key.size(-1)) - value = value.view(-1, value.size(-2), value.size(-1)) + key = key.contiguous().view(-1, key.size(-2), key.size(-1)) + value = value.contiguous().view(-1, value.size(-2), value.size(-1)) position_ids = position_ids.flatten() indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32) @@ -180,6 +183,47 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids): return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length)) +def fa_peft_integration_check( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + target_dtype: Optional[torch.dtype] = None, +): + """ + PEFT usually casts the layer norms in float32 for training stability reasons + therefore the input hidden states gets silently casted in float32. Hence, we need + cast them back in float16 / bfloat16 just to be sure everything works as expected. + This might slowdown training & inference so it is recommended to not cast the LayerNorms! + + Args: + query (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value (`torch.Tensor`): + Input value states to be passed to Flash Attention API + target_dtype (`torch.dtype`, *optional*): + The dtype to convert the attention tensors to. Conversion can be ignored by + not providing the target dtype. + """ + if target_dtype is None: + return query, key, value + + input_dtype = value.dtype + if input_dtype == torch.float32: + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query = query.to(target_dtype) + key = key.to(target_dtype) + value = value.to(target_dtype) + + return query, key, value + + flash_241 = is_flash_attn_greater_or_equal("2.4.1") deterministic_g = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" @@ -202,6 +246,7 @@ def _flash_attention_forward( cu_seq_lens_k: Optional[torch.LongTensor] = None, max_length_q: Optional[int] = None, max_length_k: Optional[int] = None, + target_dtype: Optional[torch.dtype] = None, ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token @@ -248,6 +293,11 @@ def _flash_attention_forward( if softcap is not None: flash_kwargs["softcap"] = softcap + # PEFT possibly silently casts tensors to fp32, this potentially reconverts to correct dtype or is a no op + query_states, key_states, value_states = fa_peft_integration_check( + query_states, key_states, value_states, target_dtype + ) + # Contains at least one padding token in the sequence if attention_mask is not None: batch_size = query_states.shape[0] diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index f58bf330ce7db3..7562649be753bb 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -15,7 +15,7 @@ # limitations under the License. import re -from typing import Dict, Optional +from typing import Dict, NamedTuple, Optional import numpy as np from tqdm import tqdm @@ -55,6 +55,200 @@ GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["tensors"].keys()) +class GGUFTensor(NamedTuple): + weights: np.ndarray + name: str + metadata: dict + + +class TensorProcessor: + def __init__(self, config=None): + self.config = config or {} + + def process(self, weights, name, **kwargs): + return GGUFTensor(weights, name, {}) + + +class LlamaTensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + if ".attn_k." in name or ".attn_q." in name: + num_heads = self.config.get("num_attention_heads") + num_kv_heads = self.config.get("num_key_value_heads") + + if None in (num_heads, num_kv_heads): + return GGUFTensor(weights, name, {}) + if ".attn_q." in name: + weights = self._reverse_permute_weights(weights, num_heads, num_heads) + elif ".attn_k." 
in name: + weights = self._reverse_permute_weights(weights, num_heads, num_kv_heads) + return GGUFTensor(weights, name, {}) + + def _reverse_permute_weights( + self, weights: np.ndarray, n_head: int, num_kv_heads: Optional[int] = None + ) -> np.ndarray: + # Original permutation implementation + # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L1402-L1408 + if num_kv_heads is not None and n_head != num_kv_heads: + n_head = num_kv_heads + + dim = weights.shape[0] // n_head // 2 + w = weights.reshape(n_head, dim, 2, *weights.shape[1:]) + return w.swapaxes(2, 1).reshape(weights.shape) + + +class Qwen2MoeTensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + if "_exp" in name: + tensor_key_mapping = kwargs.get("tensor_key_mapping") + parsed_parameters = kwargs.get("parsed_parameters") + if tensor_key_mapping: + self._split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping) + return GGUFTensor(weights, None, {}) + if "ffn_gate_inp_shexp" in name: + # for compatibility tensor shared_expert_gate must be (1, 2048) dim, + # quantized one is (2048) + weights = np.expand_dims(weights, axis=0) + return GGUFTensor(weights, name, {}) + + def _split_moe_expert_tensor( + self, weights: np.ndarray, parsed_parameters: Dict[str, Dict], name: str, tensor_key_mapping: dict + ): + # Original merge implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022 + exp_name = "" + if "ffn_gate_exps" in name: + exp_name = "gate_proj" + elif "ffn_down_exps" in name: + exp_name = "down_proj" + elif "ffn_up_exps" in name: + exp_name = "up_proj" + else: + raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.") + for tensor_name in tensor_key_mapping: + if tensor_name in name: + name = name.replace(tensor_name, tensor_key_mapping[tensor_name]) + w_counter = self.config.get("num_experts", 60) + for i in range(0, w_counter): + temp_name = name.replace(".weight", f".{i}.{exp_name}.weight") + exp_weight = weights[i] + parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight)) + + +class BloomTensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + if "attn_qkv" in name: + num_heads = self.config["n_head"] + n_embed = self.config["hidden_size"] + if "weight" in name: + weights = self._reverse_reshape_weights(weights, num_heads, n_embed) + else: + weights = self._reverse_reshape_bias(weights, num_heads, n_embed) + return GGUFTensor(weights, name, {}) + + def _reverse_reshape_weights(self, weights: np.ndarray, n_head: int, n_embed: int): + # Original reshape implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L972-L985 + q, k, v = np.array_split(weights, 3, axis=0) + + q = q.reshape(n_head, n_embed // n_head, n_embed) + k = k.reshape(n_head, n_embed // n_head, n_embed) + v = v.reshape(n_head, n_embed // n_head, n_embed) + qkv_weights = np.stack([q, k, v], axis=1) + + return qkv_weights.reshape(n_head * 3 * (n_embed // n_head), n_embed) + + def _reverse_reshape_bias(self, weights: np.ndarray, n_head: int, n_embed: int): + # Original reshape implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L986-L998 + q_bias, k_bias, v_bias = np.array_split(weights, 3) + + q_bias = q_bias.reshape(n_head, n_embed 
// n_head) + k_bias = k_bias.reshape(n_head, n_embed // n_head) + v_bias = v_bias.reshape(n_head, n_embed // n_head) + + qkv_bias = np.stack([q_bias, k_bias, v_bias], axis=1).flatten() + return qkv_bias + + +class T5TensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + bid = None + for chunk in name.split("."): + if chunk.isdigit(): + bid = int(chunk) + break + return GGUFTensor(weights, name, {"bid": bid}) + + +class GPT2TensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + # Original transpose implementation + # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L2060-L2061 + if ( + "attn_qkv.weight" in name + or "ffn_down.weight" in name + or "ffn_up.weight" in name + or "attn_output.weight" in name + ): + weights = weights.T + + # Handle special case for output.weight + if name == "output.weight": + # output.weight has conflicts with attn_output.weight in name checking + # Store the tensor directly and signal to skip further processing + name = "lm_head.weight" + parsed_parameters = kwargs.get("parsed_parameters", {}) + parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) + name = None # Signal to skip further processing + return GGUFTensor(weights, name, {}) + + +class MambaTensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + if "ssm_d" in name and "bias" not in name and "weight" not in name: + # ssm_d has conflicts with ssm_dt in name checking + # we have to explicitly check that name is exactly ssm_d + name = name.replace("ssm_d", "mixer.D") + if "ssm_conv1d.weight" in name: + # for compatibility tensor ssm_conv1d must be (5120, 1, 4]) dim, + # quantized one is (5120, 4) + weights = np.expand_dims(weights, axis=1) + if "ssm_a" in name: + # Original exponential implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977 + weights = np.log(-weights) + return GGUFTensor(weights, name, {}) + + +TENSOR_PROCESSORS = { + "llama": LlamaTensorProcessor, + "qwen2moe": Qwen2MoeTensorProcessor, + "bloom": BloomTensorProcessor, + "t5": T5TensorProcessor, + "t5encoder": T5TensorProcessor, + "gpt2": GPT2TensorProcessor, + "mamba": MambaTensorProcessor, +} + + def read_field(reader, field): value = reader.fields[field] return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] @@ -97,7 +291,6 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): # FIXME: Currnetly this implementation is only for flan-t5 architecture. # It needs to be developed for supporting legacy t5. 
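The Llama processor above undoes the head permutation that llama.cpp's `convert_hf_to_gguf.py` applies to `attn_q`/`attn_k` weights. As a sanity check, here is a small self-contained numpy sketch (toy shapes, not part of the PR) showing that the reverse reshape/swapaxes round-trips that permutation:

```python
import numpy as np

# Toy shapes only; real checkpoints use the model's num_attention_heads / head_dim.
n_head, head_dim, in_features = 4, 8, 16
w = np.arange(n_head * head_dim * in_features, dtype=np.float32).reshape(n_head * head_dim, in_features)

def permute(weights, n_head):
    # Sketch of the permutation applied on the llama.cpp conversion side.
    return (
        weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
        .swapaxes(1, 2)
        .reshape(weights.shape)
    )

def reverse_permute(weights, n_head):
    # Mirrors LlamaTensorProcessor._reverse_permute_weights above.
    dim = weights.shape[0] // n_head // 2
    return weights.reshape(n_head, dim, 2, *weights.shape[1:]).swapaxes(2, 1).reshape(weights.shape)

assert np.array_equal(reverse_permute(permute(w, n_head), n_head), w)
```

For GQA checkpoints the same reverse is applied with `num_kv_heads` substituted for `n_head`, mirroring the `num_kv_heads is not None and n_head != num_kv_heads` branch in the processor.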
elif "t5" in architecture or "t5encoder" in architecture: - parsed_parameters["config"]["tie_word_embeddings"] = False parsed_parameters["config"]["is_gated_act"] = True updated_architecture = "t5" else: @@ -132,6 +325,12 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): if architecture + model_size not in GGUF_SUPPORTED_ARCHITECTURES: raise ValueError(f"Architecture {architecture + model_size} not supported") + # Handle tie_word_embeddings, if lm_head.weight is not present in tensors, + # tie_word_embeddings is true otherwise false + parsed_parameters["config"]["tie_word_embeddings"] = all( + "output.weight" != tensor.name for tensor in reader.tensors + ) + # List all key-value pairs in a columnized format for gguf_key, field in reader.fields.items(): gguf_key = gguf_key.replace(architecture, updated_architecture) @@ -177,73 +376,28 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): if return_tensors: tensor_key_mapping = GGUF_TO_TRANSFORMERS_MAPPING["tensors"][architecture + model_size] + config = parsed_parameters.get("config", {}) + + ProcessorClass = TENSOR_PROCESSORS.get(architecture, TensorProcessor) + processor = ProcessorClass(config=config) for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."): name = tensor.name - weights = dequantize(tensor.data, tensor.tensor_type) - if architecture == "llama" and (".attn_k." in name or ".attn_q." in name): - num_heads = parsed_parameters["config"]["num_attention_heads"] - num_kv_heads = parsed_parameters["config"]["num_key_value_heads"] - if ".attn_q." in name: - weights = reverse_permute_weights(weights, num_heads, num_heads) - elif ".attn_k." in name: - weights = reverse_permute_weights(weights, num_heads, num_kv_heads) - - if architecture == "qwen2moe": - if "_exp" in name: - split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping) - continue - if "ffn_gate_inp_shexp" in name: - # for compatibility tensor shared_expert_gate must be (1, 2048) dim, - # quantized one is (2048) - weights = np.expand_dims(weights, axis=0) - - if architecture == "bloom" and "attn_qkv" in name: - num_heads = parsed_parameters["config"]["n_head"] - n_embed = parsed_parameters["config"]["hidden_size"] - if "weight" in name: - weights = reverse_reshape_weights(weights, num_heads, n_embed) - else: - weights = reverse_reshape_bias(weights, num_heads, n_embed) - - bid = None - if architecture in ("t5", "t5encoder"): - for chunk in name.split("."): - if chunk.isdigit(): - bid = int(chunk) - break - - if architecture == "gpt2": - if ( - "attn_qkv.weight" in name - or "ffn_down.weight" in name - or "ffn_up.weight" in name - or "attn_output.weight" in name - ): - # Original transpose implementation - # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L2060-L2061 - weights = weights.T - if name == "output.weight": - # output.weight has conflicts with attn_output.weight in name checking - # we have to explicitly check that name is exactly output.weight - name = "lm_head.weight" - parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) - continue - if architecture == "mamba": - if "ssm_d" in name and "bias" not in name and "weight" not in name: - # ssm_d has conflicts with ssm_dt in name checking - # we have to explicitly check that name is exactly ssm_d - name = name.replace("ssm_d", "mixer.D") - if "ssm_conv1d.weight" in name: - # for compatibility tensor ssm_conv1d must be (5120, 1, 4]) dim, - # 
quantized one is (5120, 4) - weights = np.expand_dims(weights, axis=1) - if "ssm_a" in name: - # Original exponential implementation - # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977 - weights = np.log(-weights) + result = processor.process( + weights=weights, + name=name, + tensor_key_mapping=tensor_key_mapping, + parsed_parameters=parsed_parameters, + ) + + weights = result.weights + name = result.name + bid = result.metadata.get("bid") + + if name is None: + continue for tensor_name in tensor_key_mapping: if tensor_name.format(bid=bid) in name: @@ -256,64 +410,3 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") return parsed_parameters - - -def reverse_permute_weights(weights: np.ndarray, n_head: int, num_kv_heads: Optional[int] = None) -> np.ndarray: - # Original permutation implementation - # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L1402-L1408 - if num_kv_heads is not None and n_head != num_kv_heads: - n_head = num_kv_heads - - dim = weights.shape[0] // n_head // 2 - w = weights.reshape(n_head, dim, 2, *weights.shape[1:]) - return w.swapaxes(2, 1).reshape(weights.shape) - - -def reverse_reshape_weights(weights: np.ndarray, n_head: int, n_embed: int): - # Original reshape implementation - # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L972-L985 - q, k, v = np.array_split(weights, 3, axis=0) - - q = q.reshape(n_head, n_embed // n_head, n_embed) - k = k.reshape(n_head, n_embed // n_head, n_embed) - v = v.reshape(n_head, n_embed // n_head, n_embed) - qkv_weights = np.stack([q, k, v], axis=1) - - return qkv_weights.reshape(n_head * 3 * (n_embed // n_head), n_embed) - - -def reverse_reshape_bias(weights: np.ndarray, n_head: int, n_embed: int): - # Original reshape implementation - # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L986-L998 - q_bias, k_bias, v_bias = np.array_split(weights, 3) - - q_bias = q_bias.reshape(n_head, n_embed // n_head) - k_bias = k_bias.reshape(n_head, n_embed // n_head) - v_bias = v_bias.reshape(n_head, n_embed // n_head) - - qkv_bias = np.stack([q_bias, k_bias, v_bias], axis=1).flatten() - return qkv_bias - - -def split_moe_expert_tensor( - weights: np.ndarray, parsed_parameters: Dict[str, Dict], name: str, tensor_key_mapping: dict -): - # Original merge implementation - # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022 - exp_name = "" - if "ffn_gate_exps" in name: - exp_name = "gate_proj" - elif "ffn_down_exps" in name: - exp_name = "down_proj" - elif "ffn_up_exps" in name: - exp_name = "up_proj" - else: - raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.") - for tensor_name in tensor_key_mapping: - if tensor_name in name: - name = name.replace(tensor_name, tensor_key_mapping[tensor_name]) - w_counter = parsed_parameters["config"].get("num_experts", 60) - for i in range(0, w_counter): - temp_name = name.replace(".weight", f".{i}.{exp_name}.weight") - exp_weight = weights[i] - parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight)) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 7d62fe97007e32..8264f48818cb38 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -3160,7 +3160,7 @@ def push_to_hub( commit_message (`str`, 
*optional*): Message to commit while pushing. Will default to `"Upload model"`. private (`bool`, *optional*): - Whether or not the repository created should be private. + Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists. token (`bool` or `str`, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url` diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 71555e2f7c3162..dae29111c8dcc0 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -29,8 +29,8 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import partial, wraps -from threading import Thread -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from multiprocessing import Process +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, TypeVar, Union from zipfile import is_zipfile import torch @@ -43,7 +43,7 @@ from .activations import get_activation from .configuration_utils import PretrainedConfig from .dynamic_module_utils import custom_object_save -from .generation import GenerationConfig, GenerationMixin +from .generation import CompileConfig, GenerationConfig, GenerationMixin from .integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled from .loss.loss_utils import LOSS_MAPPING from .pytorch_utils import ( # noqa: F401 @@ -55,6 +55,7 @@ prune_conv1d_layer, prune_layer, prune_linear_layer, + translate_to_torch_parallel_style, ) from .quantizers import AutoHfQuantizer, HfQuantizer from .quantizers.quantizers_utils import get_module_from_name @@ -88,6 +89,8 @@ is_peft_available, is_remote_url, is_safetensors_available, + is_torch_flex_attn_available, + is_torch_greater_or_equal, is_torch_sdpa_available, is_torch_xla_available, logging, @@ -137,6 +140,7 @@ _init_weights = True _is_quantized = False +_is_ds_init_called = False def is_fsdp_enabled(): @@ -167,6 +171,10 @@ def is_local_dist_rank_0(): if is_peft_available(): from .utils import find_adapter_config_file + +SpecificPreTrainedModelType = TypeVar("SpecificPreTrainedModelType", bound="PreTrainedModel") + + TORCH_INIT_FUNCTIONS = { "uniform_": nn.init.uniform_, "normal_": nn.init.normal_, @@ -224,6 +232,19 @@ def set_quantized_state(): _is_quantized = False +# Skip recursive calls to deepspeed.zero.Init to avoid pinning errors. +# This issue occurs with ZeRO stage 3 when using NVMe offloading. +# For more details, refer to issue #34429. 
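The comment above refers to nested construction: under ZeRO-3, `from_pretrained` and `_from_config` wrap model construction in `deepspeed.zero.Init`, and a composite model that builds sub-models (for example through `_from_config`) inside its own `__init__` would otherwise re-enter that context. A minimal, deepspeed-free sketch of the flag pattern the helper below relies on (names here are illustrative, not the real ones):

```python
from contextlib import contextmanager

_init_in_progress = False  # stand-in for the module-level `_is_ds_init_called` flag


@contextmanager
def _outermost_init():
    # The first caller flips the flag; nested callers observe it and skip
    # re-entering the heavyweight init context (deepspeed.zero.Init in the real code).
    global _init_in_progress
    _init_in_progress = True
    try:
        yield
    finally:
        _init_in_progress = False


def build(cls, config, zero3_enabled=True):
    if zero3_enabled and not _init_in_progress:
        with _outermost_init():      # only the outermost model enters here
            return cls(config)
    return cls(config)               # sub-models are built inside the parent's scope
```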
+@contextmanager +def set_zero3_state(): + global _is_ds_init_called + _is_ds_init_called = True + try: + yield + finally: + _is_ds_init_called = False + + def get_parameter_device(parameter: Union[nn.Module, "ModuleUtilsMixin"]): try: return next(parameter.parameters()).device @@ -359,6 +380,9 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi Note: We fully disable this if we are using `deepspeed` """ + if model_to_load.device.type == "meta": + return False + if len([key for key in state_dict if key.startswith(start_prefix)]) == 0: return False @@ -373,7 +397,7 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi return False # If the model does, the incoming `state_dict` and the `model_to_load` must be the same dtype - first_key = list(model_to_load.state_dict().keys())[0] + first_key = next(iter(model_to_load.state_dict().keys())) if start_prefix + first_key in state_dict: return state_dict[start_prefix + first_key].dtype == model_to_load.state_dict()[first_key].dtype @@ -927,7 +951,10 @@ def _load_state_dict_into_meta_model( param_to = "cpu" if is_fsdp_enabled() and not is_local_dist_rank_0(): param_to = "meta" - value = type(value)(value.data.to(param_to), **value.__dict__) + val_kwargs = {} + if hasattr(module, "weight") and module.weight.__class__.__name__ == "Int8Params": + val_kwargs["requires_grad"] = False + value = type(value)(value.data.to(param_to), **val_kwargs, **value.__dict__) setattr(module, tensor_name, value) # TODO: consider removing used param_parts from state_dict before return @@ -1316,6 +1343,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix # SDPA support _supports_sdpa = False + # Flex Attention support + _supports_flex_attn = False + # Has support for a `Cache` instance as `past_key_values`? Does it support a `StaticCache`? _supports_cache_class = False _supports_static_cache = False @@ -1323,6 +1353,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix # Has support for a `QuantoQuantizedCache` instance as `past_key_values` _supports_quantized_cache = False + # A tensor parallel plan to be applied to the model when TP is enabled. For + # top-level models, this attribute is currently defined in respective model + # code. For base models, this attribute comes from + # `config.base_model_tp_plan` during `post_init`. 
+ _tp_plan = None + @property def dummy_inputs(self) -> Dict[str, torch.Tensor]: """ @@ -1367,6 +1403,9 @@ def post_init(self): """ self.init_weights() self._backward_compatibility_gradient_checkpointing() + # If current model is a base model, attach `base_model_tp_plan` from config + if self.base_model is self: + self._tp_plan = self.config.base_model_tp_plan def dequantize(self): """ @@ -1456,13 +1495,14 @@ def _from_config(cls, config, **kwargs): torch_dtype=torch_dtype, ) - if is_deepspeed_zero3_enabled() and not _is_quantized: + if is_deepspeed_zero3_enabled() and not _is_quantized and not _is_ds_init_called: import deepspeed logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") # this immediately partitions the model across all gpus, to avoid the overhead in time # and memory copying it on CPU or each GPU first - with deepspeed.zero.Init(config_dict_or_path=deepspeed_config()): + init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config()), set_zero3_state()] + with ContextManagers(init_contexts): model = cls(config, **kwargs) else: @@ -1505,12 +1545,17 @@ def _autoset_attn_implementation( "eager", "sdpa", "flash_attention_2", + "flex_attention", ]: message = f'Specified `attn_implementation="{config._attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)' if cls._supports_flash_attn_2: message += ', `"attn_implementation=flash_attention_2"` (implementation using flash attention 2)' if cls._supports_sdpa: message += ', `"attn_implementation=sdpa"` (implementation using torch.nn.functional.scaled_dot_product_attention)' + if cls._supports_flex_attn: + message += ( + ', `"attn_implementation=flex_attention"` (implementation using torch\'s flex_attention)' + ) raise ValueError(message + ".") # If a config is passed with a preset attn_implementation, we skip the automatic dispatch and use the user-provided config, with hard checks that the requested attention implementation is available. @@ -1545,6 +1590,8 @@ def _autoset_attn_implementation( hard_check_only=False, check_device_map=check_device_map, ) + elif requested_attn_implementation == "flex_attention": + config = cls._check_and_enable_flex_attn(config, hard_check_only=True) elif requested_attn_implementation in [None, "sdpa"] and not is_torch_xla_available(): # use_flash_attention_2 takes priority over SDPA, hence SDPA treated in this elif. config = cls._check_and_enable_sdpa( @@ -1741,7 +1788,7 @@ def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> Pretra """ Checks the availability of SDPA for a given model. - If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flash_attention_2" so that the model can initialize the correct attention module. + If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "sdpa" so that the model can initialize the correct attention module. """ if hard_check_only: if not cls._supports_sdpa: @@ -1766,6 +1813,35 @@ def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> Pretra config._attn_implementation = "sdpa" return config + @classmethod + def _check_and_enable_flex_attn(cls, config, hard_check_only: bool = False) -> PretrainedConfig: + """ + Checks the availability of Flex Attention for a given model. 
+ + If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flex_attention" so that the model can initialize the correct attention module. + """ + if hard_check_only: + if not cls._supports_flex_attn: + raise ValueError( + f"{cls.__name__} does not support an attention implementation through torch's flex_attention." + " Please request the support for this architecture: https://github.com/huggingface/transformers/issues/34809." + " If you believe this error is a bug, please open an issue in Transformers GitHub repository" + ' and load your model with the argument `attn_implementation="eager"` meanwhile.' + ' Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`' + ) + if not is_torch_flex_attn_available(): + raise ImportError( + "PyTorch Flex Attention requirements in Transformers are not met. Please install torch>=2.5.0." + ) + + if not is_torch_flex_attn_available() or not cls._supports_flex_attn: + return config + + if not hard_check_only: + config._attn_implementation = "flex_attention" + + return config + def enable_input_require_grads(self): """ Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping @@ -2927,7 +3003,12 @@ def save_pretrained( if module_map: filename_to_tensors = logging.tqdm(filename_to_tensors, desc="Saving checkpoint shards") for shard_file, tensors in filename_to_tensors: - shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors} + shard = {} + for tensor in tensors: + shard[tensor] = state_dict[tensor].contiguous() + # delete reference, see https://github.com/huggingface/transformers/pull/34890 + del state_dict[tensor] + # remake shard with onloaded parameters if necessary if module_map: if accelerate_version < version.parse("0.31"): @@ -2954,6 +3035,8 @@ def save_pretrained( else: save_function(shard, os.path.join(save_directory, shard_file)) + del state_dict + if index is None: path_to_weights = os.path.join(save_directory, weights_name) logger.info(f"Model weights saved in {path_to_weights}") @@ -3102,7 +3185,7 @@ def float(self, *args): @classmethod def from_pretrained( - cls, + cls: Type[SpecificPreTrainedModelType], pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None, @@ -3112,10 +3195,10 @@ def from_pretrained( local_files_only: bool = False, token: Optional[Union[str, bool]] = None, revision: str = "main", - use_safetensors: bool = None, + use_safetensors: Optional[bool] = None, weights_only: bool = True, **kwargs, - ) -> "PreTrainedModel": + ) -> SpecificPreTrainedModelType: r""" Instantiate a pretrained pytorch model from a pre-trained model configuration. @@ -3396,6 +3479,11 @@ def from_pretrained( # Cache path to the GGUF file gguf_path = None + tp_plan = kwargs.pop("tp_plan", None) + if tp_plan is not None and tp_plan != "auto": + # TODO: we can relax this check when we support taking tp_plan from a json file, for example. 
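With the `tp_plan` keyword introduced above, passing `tp_plan="auto"` shards the model over all ranks according to the config's `base_model_tp_plan`. A hedged usage sketch, run under something like `torchrun --nproc-per-node 4 demo.py`; the checkpoint id is a placeholder, and torch>=2.5 plus an already-initialized process group are assumed, matching the checks in this code path:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.distributed.init_process_group("nccl")              # required before tp_plan loading
rank = torch.distributed.get_rank()
device = torch.device("cuda", rank % torch.cuda.device_count())

model_id = "meta-llama/Llama-3.1-8B-Instruct"             # illustrative checkpoint only
model = AutoModelForCausalLM.from_pretrained(model_id, tp_plan="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Tensor parallelism shards each layer across ranks.", return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(**inputs).logits
print(f"rank {rank}: logits {tuple(logits.shape)}")
```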
+ raise ValueError(f"tp_plan supports 'auto' only for now but got {tp_plan}.") + if is_fsdp_enabled(): low_cpu_mem_usage = True @@ -3579,7 +3667,11 @@ def from_pretrained( if hf_quantizer is not None: hf_quantizer.validate_environment( - torch_dtype=torch_dtype, from_tf=from_tf, from_flax=from_flax, device_map=device_map + torch_dtype=torch_dtype, + from_tf=from_tf, + from_flax=from_flax, + device_map=device_map, + weights_only=weights_only, ) torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype) device_map = hf_quantizer.update_device_map(device_map) @@ -3797,11 +3889,11 @@ def from_pretrained( **has_file_kwargs, } if not has_file(pretrained_model_name_or_path, safe_weights_name, **has_file_kwargs): - Thread( + Process( target=auto_conversion, args=(pretrained_model_name_or_path,), kwargs={"ignore_errors_during_conversion": True, **cached_file_kwargs}, - name="Thread-autoconversion", + name="Process-auto_conversion", ).start() else: # Otherwise, no PyTorch file was found, maybe there is a TF or Flax model file. @@ -3997,18 +4089,32 @@ def from_pretrained( # Instantiate model. init_contexts = [no_init_weights(_enable=_fast_init)] + tp_device = None - if is_deepspeed_zero3_enabled() and not is_quantized: + if is_deepspeed_zero3_enabled() and not is_quantized and not _is_ds_init_called: import deepspeed logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") - init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config())] + init_contexts + init_contexts = [ + deepspeed.zero.Init(config_dict_or_path=deepspeed_config()), + set_zero3_state(), + ] + init_contexts elif low_cpu_mem_usage: if not is_accelerate_available(): raise ImportError( f"Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" ) init_contexts.append(init_empty_weights()) + elif tp_plan is not None: + if not torch.distributed.is_initialized(): + raise ValueError("Tensor Parallel requires torch.distributed to be initialized first.") + + # Detect the accelerator on the machine. If no accelerator is available, it returns CPU. + device_type = torch._C._get_accelerator().type + device_module = torch.get_device_module(device_type) + # Get device with index assuming equal number of devices per host + tp_device = torch.device(device_type, torch.distributed.get_rank() % device_module.device_count()) + init_contexts.append(tp_device) if is_deepspeed_zero3_enabled() and is_quantized: init_contexts.append(set_quantized_state()) @@ -4142,32 +4248,38 @@ def from_pretrained( if dtype_orig is not None: torch.set_default_dtype(dtype_orig) - ( - model, - missing_keys, - unexpected_keys, - mismatched_keys, - offload_index, - error_msgs, - ) = cls._load_pretrained_model( - model, - state_dict, - loaded_state_dict_keys, # XXX: rename? 
- resolved_archive_file, - pretrained_model_name_or_path, - ignore_mismatched_sizes=ignore_mismatched_sizes, - sharded_metadata=sharded_metadata, - _fast_init=_fast_init, - low_cpu_mem_usage=low_cpu_mem_usage, - device_map=device_map, - offload_folder=offload_folder, - offload_state_dict=offload_state_dict, - dtype=torch_dtype, - hf_quantizer=hf_quantizer, - keep_in_fp32_modules=keep_in_fp32_modules, - gguf_path=gguf_path, - weights_only=weights_only, - ) + load_contexts = [] + # Make sure we load onto targeted device + if tp_device is not None: + load_contexts.append(tp_device) + + with ContextManagers(load_contexts): + ( + model, + missing_keys, + unexpected_keys, + mismatched_keys, + offload_index, + error_msgs, + ) = cls._load_pretrained_model( + model, + state_dict, + loaded_state_dict_keys, # XXX: rename? + resolved_archive_file, + pretrained_model_name_or_path, + ignore_mismatched_sizes=ignore_mismatched_sizes, + sharded_metadata=sharded_metadata, + _fast_init=_fast_init, + low_cpu_mem_usage=low_cpu_mem_usage, + device_map=device_map, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + dtype=torch_dtype, + hf_quantizer=hf_quantizer, + keep_in_fp32_modules=keep_in_fp32_modules, + gguf_path=gguf_path, + weights_only=weights_only, + ) # make sure token embedding weights are still tied if needed model.tie_weights() @@ -4251,6 +4363,16 @@ def from_pretrained( } return model, loading_info + if tp_plan is not None: + assert tp_device is not None, "tp_device not set!" + if not model.supports_tp_plan: + raise NotImplementedError("This model does not have a tensor parallel plan.") + # Assuming sharding the model onto the world + world_size = torch.distributed.get_world_size() + device_mesh = torch.distributed.init_device_mesh(tp_device.type, (world_size,)) + # Apply Tensor Parallelism + model.tensor_parallel(device_mesh) + return model @classmethod @@ -4940,6 +5062,56 @@ def _is_quantized_training_enabled(self): return self.hf_quantizer.is_trainable + @property + def supports_tp_plan(self): + """ + Returns whether the model has a tensor parallelism plan. + """ + if self._tp_plan is not None: + return True + # Check if base model has a TP plan + if getattr(self.base_model, "_tp_plan", None) is not None: + return True + return False + + def tensor_parallel(self, device_mesh): + """ + Tensor parallelize the model across the given device mesh. + + Args: + device_mesh (`torch.distributed.DeviceMesh`): + The device mesh to use for tensor parallelism. + """ + if not is_torch_greater_or_equal("2.5"): + raise EnvironmentError("tensor parallel is only supported for `torch>=2.5`.") + + # Tensor parallelize a nn.Module based on the `_tp_plan` attribute of the module. + # No op if `_tp_plan` attribute does not exist under the module. + # This is a helper function to be used with `model.apply` to recursively + # parallelize a model. + def tplize(mod: torch.nn.Module) -> None: + tp_plan = getattr(mod, "_tp_plan", None) + if tp_plan is None: + return + logger.debug(f"Applying tensor parallel to {mod.__class__.__name__}: {tp_plan}") + # In model configs, we use a neutral type (string) to specify + # parallel styles, here we translate them into torch TP types. + # Using tree_map because `tp_plan` is a dict. + tp_plan = torch.utils._pytree.tree_map( + translate_to_torch_parallel_style, + tp_plan, + ) + # Apply TP to current module. 
+ torch.distributed.tensor.parallel.parallelize_module( + mod, + device_mesh=device_mesh, + parallelize_plan=tp_plan, + ) + + # `apply` is a native method of `nn.Module` that recursively applies a + # function to every submodule. + self.apply(tplize) + @property def loss_function(self): if getattr(self.config, "loss_type", None) is not None: @@ -4961,6 +5133,21 @@ def loss_function(self): loss_type = "ForCausalLM" return LOSS_MAPPING[loss_type] + def get_compiled_call(self, compile_config: CompileConfig): + """Return a `torch.compile`'d version of `self.__call__`. This is useful to dynamically choose between + non-compiled/compiled `forward` during inference, especially to switch between prefill (where we don't + want to use compiled version to avoid recomputing the graph with new shapes) and iterative decoding + (where we want the speed-ups of compiled version with static shapes).""" + # Only reset it if not present or different from previous config + default_config = getattr(self.generation_config, "compile_config", CompileConfig()) + if ( + not hasattr(self, "_compiled_call") + or getattr(self, "_last_compile_config", default_config) != compile_config + ): + self._last_compile_config = compile_config + self._compiled_call = torch.compile(self.__call__, **compile_config.to_dict()) + return self._compiled_call + PreTrainedModel.push_to_hub = copy_func(PreTrainedModel.push_to_hub) if PreTrainedModel.push_to_hub.__doc__ is not None: diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 0d4b9f2f94de9b..116b71c81ad9df 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -16,6 +16,7 @@ albert, align, altclip, + aria, audio_spectrogram_transformer, auto, autoformer, @@ -117,6 +118,7 @@ idefics, idefics2, idefics3, + ijepa, imagegpt, informer, instructblip, @@ -177,7 +179,7 @@ nougat, nystromformer, olmo, - olmo_1124, + olmo2, olmoe, omdet_turbo, oneformer, diff --git a/src/transformers/models/aria/__init__.py b/src/transformers/models/aria/__init__.py new file mode 100644 index 00000000000000..f73301321527c1 --- /dev/null +++ b/src/transformers/models/aria/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
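Back in `modeling_utils.py`, `get_compiled_call` caches a `torch.compile`'d `__call__` keyed on its `CompileConfig`. A hedged sketch of the intended split between an eager prefill and compiled decode steps (the checkpoint is a placeholder, `CompileConfig()` is used with defaults since its fields are simply forwarded to `torch.compile`, and in practice `generate()` is the main consumer, pairing this with a static cache so decode shapes really are fixed):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import CompileConfig

model_id = "meta-llama/Llama-3.2-1B"                       # placeholder
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)

compiled_call = model.get_compiled_call(CompileConfig())   # compiled once, cached on the model

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)

# Prefill with the plain __call__: prompt lengths vary, so compiling here would
# only trigger recompilation for every new shape.
out = model(**inputs, use_cache=True)
past, next_token = out.past_key_values, out.logits[:, -1:].argmax(-1)

for _ in range(16):
    # Single-token decode steps reuse the compiled call.
    out = compiled_call(input_ids=next_token, past_key_values=past, use_cache=True)
    past, next_token = out.past_key_values, out.logits[:, -1:].argmax(-1)
```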
+from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_aria import * + from .image_processing_aria import * + from .modeling_aria import * + from .processing_aria import * + +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py new file mode 100644 index 00000000000000..ff34d59f5dfe1a --- /dev/null +++ b/src/transformers/models/aria/configuration_aria.py @@ -0,0 +1,299 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation +from ..auto import CONFIG_MAPPING, AutoConfig + + +class AriaTextConfig(PretrainedConfig): + r""" + This class handles the configuration for the text component of the Aria model. + Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + This class extends the LlamaConfig to include additional parameters specific to the Mixture of Experts (MoE) architecture. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 4096): + The size of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens, + Llama 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 2): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. 
If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + head_dim (`int`, *optional*): + The attention head dimension. If None, it will default to hidden_size // num_heads + moe_num_experts (`int`, *optional*, defaults to 8): + The number of experts in the MoE layer. + moe_topk (`int`, *optional*, defaults to 2): + The number of top experts to route to for each token. + moe_num_shared_experts (`int`, *optional*, defaults to 2): + The number of shared experts. + """ + + model_type = "aria_text" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `AriaTextModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_config_key = "text_config" + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size: int = 4096, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=2, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + head_dim=None, + moe_num_experts: int = 8, + moe_topk: int = 2, + moe_num_shared_experts: int = 2, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = 
pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + self.moe_num_experts = moe_num_experts + self.moe_topk = moe_topk + self.moe_num_shared_experts = moe_num_shared_experts + + +class AriaConfig(PretrainedConfig): + r""" + This class handles the configuration for both vision and text components of the Aria model, + as well as additional parameters for image token handling and projector mapping. + Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`AriaVisionConfig` or `dict`, *optional*): + Configuration for the vision component. + vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to select the vision feature. + text_config (`AriaTextConfig` or `dict`, *optional*): + Configuration for the text component. + projector_patch_to_query_dict (`dict`, *optional*): + Mapping of patch sizes to query dimensions. + image_token_index (`int`, *optional*, defaults to 9): + Index used to represent image tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal initializer for initializing all weight matrices. + + Attributes: + model_type (`str`): + Type of the model, set to `"aria"`. + image_token_index (`int`): + Index used to represent image tokens. + projector_patch_to_query_dict (`dict`): + Mapping of patch sizes to query dimensions. + vision_config (`AriaVisionConfig`): + Configuration for the vision component. + text_config (`AriaTextConfig`): + Configuration for the text component. 
+ """ + + model_type = "aria" + sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + vision_feature_layer: int = -1, + text_config: AriaTextConfig = None, + projector_patch_to_query_dict: Dict = None, + image_token_index: int = 9, + initializer_range: float = 0.02, + **kwargs, + ): + self.image_token_index = image_token_index + + # Convert the keys and values of projector_patch_to_query_dict to integers + # This ensures consistency even if they were provided as strings + if projector_patch_to_query_dict is None: + projector_patch_to_query_dict = { + 1225: 128, + 4900: 256, + } + self.projector_patch_to_query_dict = {int(k): int(v) for k, v in projector_patch_to_query_dict.items()} + self.max_value_projector_patch_to_query_dict = max(self.projector_patch_to_query_dict.values()) + self.vision_feature_layer = vision_feature_layer + if isinstance(vision_config, dict): + vision_config["model_type"] = "idefics3_vision" + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["idefics3_vision"]() + + self.vision_config = vision_config + self.initializer_range = initializer_range + + if isinstance(text_config, dict) and "model_type" in text_config: + text_config = AriaTextConfig(**text_config) + elif text_config is None: + text_config = AriaTextConfig() + + self.text_config = text_config + + super().__init__(**kwargs) + + +__all__ = ["AriaConfig", "AriaTextConfig"] diff --git a/src/transformers/models/aria/convert_aria_weights_to_hf.py b/src/transformers/models/aria/convert_aria_weights_to_hf.py new file mode 100644 index 00000000000000..dcc9e4d1397672 --- /dev/null +++ b/src/transformers/models/aria/convert_aria_weights_to_hf.py @@ -0,0 +1,162 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
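Before the conversion script, a quick hedged sketch of how the composite `AriaConfig` defined above normalizes its inputs (the values are arbitrary, and it assumes `AriaConfig`/`AriaTextConfig` are importable from the top-level package, as the new `__all__` and `models/__init__.py` entries suggest):

```python
from transformers import AriaConfig, AriaTextConfig

config = AriaConfig(
    text_config={"model_type": "aria_text", "hidden_size": 1024, "num_hidden_layers": 4},
    vision_config=None,                                              # falls back to an Idefics3 vision config
    projector_patch_to_query_dict={"1225": "128", "4900": "256"},    # string keys/values are cast to int
)

assert isinstance(config.text_config, AriaTextConfig)
assert config.vision_config.model_type == "idefics3_vision"
assert config.projector_patch_to_query_dict == {1225: 128, 4900: 256}
assert config.max_value_projector_patch_to_query_dict == 256
```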
+import argparse +import glob + +import torch +from huggingface_hub import snapshot_download +from safetensors import safe_open + +from transformers import ( + AddedToken, + AriaForConditionalGeneration, + AriaProcessor, + AutoConfig, + AutoTokenizer, +) + + +EPILOG_TXT = """Example: + python transformers/src/transformers/models/aria/convert_aria_weights_to_hf.py --text_model_id rhymes-ai/Aria --vision_model_id rhymes-ai/Aria --output_hub_path m-ric/Aria_hf_2 --old_state_dict_id rhymes-ai/Aria + +Example for creating the old state dict file with Python: + + import torch + from aria.model.language_model.aria_llama import AriaTextForCausalLM + + # load model + kwargs = {"device_map": "auto", "torch_dtype": torch.float16} + model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", low_cpu_mem_usage=True, **kwargs) + + # load vision tower + model.get_vision_tower().load_model() + + # Save state dict + torch.save(model.state_dict(), "tmp/hf_models/aria/model_state_dict.bin") +""" + +KEYS_TO_MODIFY_MAPPING = { + "vision_tower.vision_model": "vision_tower", + "ln_ffn": "layer_norm", + "ffn": "feed_forward", + "ln_kv": "layer_norm_kv", +} + + +def load_original_state_dict(model_id): + directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) + + original_state_dict = {} + for path in glob.glob(f"{directory_path}/*"): + if path.endswith(".safetensors"): + with safe_open(path, framework="pt", device="cpu") as f: + for key in f.keys(): + original_state_dict[key] = f.get_tensor(key) + + return original_state_dict + + +def convert_state_dict_to_hf(state_dict): + new_state_dict = {} + for key, value in state_dict.items(): + if key.endswith(".inv_freq"): + continue + for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + + new_state_dict[key] = value + new_state_dict["vision_tower.post_layernorm.weight"] = torch.zeros((1152,)) + new_state_dict["vision_tower.post_layernorm.bias"] = torch.zeros((1152,)) + + return new_state_dict + + +def convert_aria_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): + torch.set_default_dtype(torch.float16) + + tokenizer = AutoTokenizer.from_pretrained( + text_model_id, + extra_special_tokens={ + "image_token": "<|img|>", + "pad_token": "", + }, + ) + tokenizer.add_tokens(AddedToken("<|img|>", special=True, normalized=False), special_tokens=True) + tokenizer.add_special_tokens({"pad_token": ""}) + tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<|img|>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" + + processor = AriaProcessor.from_pretrained( + text_model_id, + tokenizer=tokenizer, + ) + + config = AutoConfig.from_pretrained(text_model_id) + config.vision_config.hidden_size = 1152 + config.vision_config.attention_heads = 16 + config.pad_token_id = 2 + config.image_token_index = 9 + config.intermediate_size = config.moe_intermediate_size + config.auto_map = { + "AutoConfig": "modeling_aria.AriaConfig", + "AutoModelForCausalLM": "modeling_aria.AriaForConditionalGeneration", + } + + with 
torch.device("meta"): + model = AriaForConditionalGeneration(config) + + state_dict = load_original_state_dict(old_state_dict_id) + + state_dict = convert_state_dict_to_hf(state_dict) + model.load_state_dict(state_dict, strict=False, assign=True) + + # print("Saving models") + # model.save_pretrained("local_aria", safe_serialization=False) + # processor.save_pretrained("local_aria") + print("Pushing to hub") + model.push_to_hub(output_hub_path, create_pr=True) + processor.push_to_hub(output_hub_path, create_pr=True) + + +def main(): + parser = argparse.ArgumentParser( + epilog=EPILOG_TXT, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--text_model_id", + default="rhymes-ai/Aria", + help="Hub location of the text model", + ) + parser.add_argument( + "--vision_model_id", + default="rhymes-ai/Aria", + help="Hub location of the vision model", + ) + parser.add_argument( + "--output_hub_path", + default="rhymes-ai/Aria", + help="Location on the hub of the converted model", + ) + parser.add_argument( + "--old_state_dict_id", + default="rhymes-ai/Aria", + help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", + ) + args = parser.parse_args() + convert_aria_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) + + +if __name__ == "__main__": + main() diff --git a/src/transformers/models/aria/image_processing_aria.py b/src/transformers/models/aria/image_processing_aria.py new file mode 100644 index 00000000000000..7b00665aa2859d --- /dev/null +++ b/src/transformers/models/aria/image_processing_aria.py @@ -0,0 +1,504 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution +from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_valid_image, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType + + +def make_batched_images(images) -> List[List[ImageInput]]: + """ + Accepts images in list or nested list format, and makes a list of images for preprocessing. 
+ + Args: + images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): + The input image. + + Returns: + list: A list of images. + """ + if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): + return [img for img_list in images for img in img_list] + + elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): + return images + + elif is_valid_image(images): + return [images] + + raise ValueError(f"Could not make batched video from {images}") + + +def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]: + """ + Divides an image into patches of a specified size. + + Args: + image (`np.array`): + The input image. + patch_size (`int`): + The size of each patch. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + list: A list of np.array representing the patches. + """ + patches = [] + height, width = get_image_size(image, channel_dim=input_data_format) + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + if input_data_format == ChannelDimension.LAST: + patch = image[i : i + patch_size, j : j + patch_size] + else: + patch = image[:, i : i + patch_size, j : j + patch_size] + patches.append(patch) + + return patches + + +def _get_patch_output_size(image, target_resolution, input_data_format): + original_height, original_width = get_image_size(image, channel_dim=input_data_format) + target_height, target_width = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + return new_height, new_width + + +class AriaImageProcessor(BaseImageProcessor): + """ + A vision processor for the Aria model that handles image preprocessing. + Initialize the AriaImageProcessor. + + Args: + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. + image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to 980): + Maximum image size. + min_image_size (`int`, *optional*, defaults to 336): + Minimum image size. + split_resolutions (`list`, *optional*, defaults to a list of optimal,resolutions as tuples): + The optimal resolutions for splitting the image. + split_image (`bool`, *optional*, defaults to `False`): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + resample (PILImageResampling, *optional*, defaults to `BICUBIC`): + The resampling filter to use if resizing the image. 
+ """ + + def __init__( + self, + image_mean: List[float] = None, + image_std: List[float] = None, + max_image_size: int = 980, + min_image_size: int = 336, + split_resolutions: Optional[List[Tuple[int, int]]] = None, + split_image: Optional[bool] = False, + do_convert_rgb: Optional[bool] = True, + do_normalize: Optional[bool] = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ): + super().__init__(**kwargs) + + if image_mean is None: + image_mean = [0.5, 0.5, 0.5] + if image_std is None: + image_std = [0.5, 0.5, 0.5] + self.max_image_size = max_image_size + self.min_image_size = min_image_size + self.image_mean = image_mean + self.image_std = image_std + self.split_image = split_image + if split_resolutions is None: + split_resolutions = [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (2, 4), (2, 3), (2, 2), (2, 1), (3, 1), (3, 2), (4, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1)] # fmt: skip + split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions] + self.split_resolutions = split_resolutions + self.do_convert_rgb = do_convert_rgb + self.do_normalize = do_normalize + self.resample = resample + + def preprocess( + self, + images: Union[ImageInput, List[ImageInput]], + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + max_image_size: Optional[int] = None, + min_image_size: Optional[int] = None, + split_image: Optional[bool] = None, + do_convert_rgb: Optional[bool] = None, + do_normalize: Optional[bool] = None, + resample: PILImageResampling = None, + return_tensors: Optional[Union[str, TensorType]] = "pt", + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Process a list of images. + + Args: + images (ImageInput or list of ImageInput): + The input image or a list of images. + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. + image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to `self.max_image_size` (980)): + Maximum image size. + min_image_size (`int`, *optional*, defaults to `self.min_image_size` (336)): + Minimum image size. + split_image (`bool`, *optional*, defaults to `self.split_image` (False)): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)): + Whether to convert the image to RGB. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)): + Whether to normalize the image. + resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)): + The resampling filter to use if resizing the image. + return_tensors (`str` or `TensorType`, *optional*, defaults to "pt"): + The type of tensor to return. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. 
+ - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + BatchFeature: + A BatchFeature object containing: + - 'pixel_values': + Tensor of processed image pixel values. + - 'pixel_mask': + Boolean pixel mask. This mask is a 2D tensor of shape (max_image_size, max_image_size) where: + - True (1) values indicate pixels that belong to the original resized image. + - False (0) values indicate pixels that are part of the padding. + The mask helps distinguish between actual image content and padded areas in subsequent processing steps. + - 'num_crops': + The maximum number of crops across all images. + """ + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + max_image_size = max_image_size if max_image_size is not None else self.max_image_size + min_image_size = min_image_size if min_image_size is not None else self.min_image_size + split_image = split_image if split_image is not None else self.split_image + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + + if max_image_size not in [490, 980]: + raise ValueError("max_image_size must be either 490 or 980") + + images = make_batched_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
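+            # Illustrative note (an assumption about typical inputs, not part of the original comment): a
+            # (height, width, 3) array is inferred as channels-last and a (3, height, width) array as
+            # channels-first; the format inferred from the first image is then applied to all of them.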
+ input_data_format = infer_channel_dimension_format(images[0]) + + pixel_values = [] + pixel_masks = [] + num_crops = None + + for image in images: + if split_image: + crop_images = self.get_image_patches( + image, + self.split_resolutions, + max_image_size, + resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + else: + crop_images = [image] + if num_crops is None or len(crop_images) > num_crops: + num_crops = len(crop_images) + + for crop_image in crop_images: + # At this point the scale is the rescaling factor that would bring the image to max_size in its larger dimension + h, w = get_image_size(crop_image) + scale = max_image_size / max(h, w) + if w >= h: + new_size = (max(int(h * scale), min_image_size), max_image_size) # h, w + else: + new_size = (max_image_size, max(int(w * scale), min_image_size)) # h, w + + crop_image_resized = resize( + crop_image, + new_size, + resample=resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + + padding_bottom, padding_right = max_image_size - new_size[0], max_image_size - new_size[1] + crop_image_padded = pad( + crop_image_resized, + ((0, padding_bottom), (0, padding_right)), + data_format=input_data_format, + input_data_format=input_data_format, + ) + + # Create a pixel mask + pixel_mask = np.zeros((max_image_size, max_image_size), dtype=bool) + pixel_mask[: new_size[0], : new_size[1]] = 1 + pixel_masks.append(pixel_mask) + + if do_normalize: + crop_image_padded = self.normalize( + crop_image_padded / 255.0, + self.image_mean, + self.image_std, + data_format=input_data_format, + input_data_format=input_data_format, + ) + crop_image_padded = ( + to_channel_dimension_format(crop_image_padded, data_format, input_data_format) + if data_format is not None + else crop_image_padded + ) + + pixel_values.append(crop_image_padded) + return BatchFeature( + data={ + "pixel_values": np.stack(pixel_values, axis=0), + "pixel_mask": np.stack(pixel_masks, axis=0), + "num_crops": num_crops, + }, + tensor_type=return_tensors, + ) + + def _resize_for_patching( + self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension + ) -> np.array: + """ + Resizes an image to a target resolution while maintaining aspect ratio. + + Args: + image (np.array): + The input image. + target_resolution (tuple): + The target resolution (height, width) of the image. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + np.array: The resized and padded image. + """ + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + # Resize the image + resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) + + return resized_image + + def _pad_for_patching( + self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension + ) -> np.array: + """ + Pad an image to a target resolution while maintaining aspect ratio. 
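+
+        Worked example (illustrative numbers, not from the original docstring): padding a resized
+        (490, 980) image to a (980, 980) target keeps the width untouched and adds
+        (980 - 490) // 2 = 245 rows of padding above and below, so the image ends up vertically centered.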
+        """
+        target_height, target_width = target_resolution
+        new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)
+
+        paste_x = (target_width - new_width) // 2
+        paste_y = (target_height - new_height) // 2
+
+        padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x)))
+
+        return padded_image
+
+    def pad(
+        self,
+        image: np.ndarray,
+        padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]],
+        mode: PaddingMode = PaddingMode.CONSTANT,
+        constant_values: Union[float, Iterable[float]] = 0.0,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`)
+        dimension or in the (`num_patches`) dimension. In the second case an iterable of tuples is expected
+        as input.
+
+        Args:
+            image (`np.ndarray`):
+                The image to pad.
+            padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
+                Padding to apply to the edges of the height, width axes. Can be one of three formats:
+                - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
+                - `((before, after),)` yields same before and after pad for height and width.
+                - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
+            mode (`PaddingMode`):
+                The padding mode to use. Can be one of:
+                    - `"constant"`: pads with a constant value.
+                    - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
+                      vector along each axis.
+                    - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
+                    - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding if `mode` is `"constant"`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use the inferred format of the input image.
+
+        Returns:
+            `np.ndarray`: The padded image.
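+
+        For example (illustrative values only): `padding=((2, 2), (1, 1))` pads 2 rows at the top and
+        bottom and 1 column on the left and right, while an integer such as `padding=3` pads 3 pixels on
+        every side; both of these cases are forwarded to the generic `pad` helper.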
+ + """ + + # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim + if isinstance(padding, int) or len(padding) != 4: + return pad(image, padding, mode, constant_values, data_format, input_data_format) + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + padding_mode_mapping = { + PaddingMode.CONSTANT: "constant", + PaddingMode.REFLECT: "reflect", + PaddingMode.REPLICATE: "edge", + PaddingMode.SYMMETRIC: "symmetric", + } + image = np.pad(image, padding, mode=padding_mode_mapping[mode], constant_values=constant_values) + image = ( + to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image + ) + return image + + def get_image_patches( + self, + image: np.array, + grid_pinpoints: List[Tuple[int, int]], + patch_size: int, + resample: PILImageResampling, + data_format: ChannelDimension, + input_data_format: ChannelDimension, + ) -> List[np.array]: + """ + Process an image with variable resolutions by dividing it into patches. + + Args: + image (`np.array`): + The input image to be processed. + grid_pinpoints (List[Tuple[int, int]]): + A list of possible resolutions as tuples. + patch_size (`int`): + Size of the patches to divide the image into. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + data_format (`ChannelDimension` or `str`): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + `List[np.array]`: A list of NumPy arrays containing the processed image patches. + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints must be a list of possible resolutions.") + + possible_resolutions = grid_pinpoints + + image_size = get_image_size(image, channel_dim=input_data_format) + best_resolution = select_best_resolution(image_size, possible_resolutions) + resized_image = self._resize_for_patching( + image, best_resolution, resample=resample, input_data_format=input_data_format + ) + padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format) + + patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format) + + # make sure that all patches are in the input data format + patches = [ + to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format) + for patch in patches + ] + return patches + + +__all__ = ["AriaImageProcessor"] diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py new file mode 100644 index 00000000000000..1b4e4087b1a49d --- /dev/null +++ b/src/transformers/models/aria/modeling_aria.py @@ -0,0 +1,1920 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + LossKwargs, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from ...utils.import_utils import is_torch_available +from ..auto import AutoModel, AutoModelForCausalLM +from .configuration_aria import AriaConfig, AriaTextConfig + + +if is_torch_available(): + import torch + from torch import nn + + +logger = logging.get_logger(__name__) +_CONFIG_FOR_DOC = "AriaTextConfig" + + +class AriaTextRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + AriaTextRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class AriaProjectorMLP(nn.Module): + """ + Feed-Forward Network module for the Aria Projector. + + Args: + in_features (`int`): + Input embedding dimension. + hidden_features (`int`): + Hidden dimension of the feed-forward network. + output_dim (`int`): + Output dimension. + """ + + def __init__(self, in_features, hidden_features, output_dim): + super().__init__() + self.linear_in = nn.Linear(in_features, hidden_features, bias=False) + self.linear_out = nn.Linear(hidden_features, output_dim, bias=False) + self.act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_states = self.act(self.linear_in(hidden_states)) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + +class AriaCrossAttention(nn.Module): + """ + Aria Cross-Attention module. + + Args: + config (`AriaConfig`): + The configuration to use. 
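+
+    Shape sketch (inferred from the forward pass below, stated here for convenience): queries are derived
+    from `hidden_states` of shape (batch_size, num_queries, hidden_size), keys and values from
+    `key_value_states` of shape (batch_size, num_patches, hidden_size), and the output keeps the query
+    shape (batch_size, num_queries, hidden_size).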
+ """ + + def __init__(self, config: AriaConfig, dropout_rate: float = 0): + super().__init__() + hidden_size = config.vision_config.hidden_size + num_heads = config.vision_config.num_attention_heads + self.num_heads = num_heads + self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False) + + # Original code here: https://github.com/rhymes-ai/Aria/blob/719ff4e52b727443cba3793b0e27fe64e0244fe1/aria/model/projector.py#L48 + self.multihead_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True) + self.linear = nn.Linear(hidden_size, hidden_size) + self.dropout = nn.Dropout(dropout_rate) + + self.layer_norm = nn.LayerNorm(hidden_size) + self.layer_norm_kv = nn.LayerNorm(hidden_size) + + def forward(self, key_value_states, hidden_states, attn_mask=None): + """ + Forward pass of the AriaCrossAttention module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor for key and value. + hidden_states (`torch.Tensor`): + Input tensor for query. + attn_mask (`torch.Tensor`, *optional*, defaults to None): + Attention mask. + + Returns: + torch.Tensor: + Output tensor after cross-attention. + """ + query = self.q_proj(self.layer_norm(hidden_states)) + + key_value_states = self.layer_norm_kv(key_value_states) + key = self.k_proj(key_value_states) + value = self.v_proj(key_value_states) + + attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) + + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + Aria Projector module. + + This module projects vision features into the language model's embedding space, enabling interaction between vision and language components. + + Args: + config (`AriaConfig`): + Configuration object for the model. + """ + + def __init__( + self, + config: AriaConfig, + ): + super().__init__() + + self.patch_to_query_dict = config.projector_patch_to_query_dict + self.in_features = config.vision_config.hidden_size + self.num_heads = config.vision_config.num_attention_heads + self.kv_dim = config.vision_config.hidden_size + self.hidden_features = config.text_config.hidden_size + self.output_dim = config.text_config.hidden_size + + self.query = nn.Parameter(torch.zeros(config.max_value_projector_patch_to_query_dict, self.in_features)) + + self.cross_attn = AriaCrossAttention(config) + + self.layer_norm = nn.LayerNorm(self.in_features) + self.feed_forward = AriaProjectorMLP(self.in_features, self.hidden_features, self.output_dim) + + def forward(self, key_value_states: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): + """ + Forward pass of the Projector module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor of shape (batch_size, num_patches, kv_dim). + attn_mask (`torch.Tensor`, *optional*, default is None): + Attention mask. + + Returns: + `torch.Tensor`: Output tensor of shape (batch_size, query_number, output_dim). + """ + batch_size, num_patches = key_value_states.shape[0], key_value_states.shape[1] + + if num_patches not in self.patch_to_query_dict.keys(): + raise KeyError( + f"Number of patches {num_patches} not found in patch_to_query_dict amongst possible values {self.patch_to_query_dict.keys()}." 
+ ) + query_num = self.patch_to_query_dict[num_patches] + + queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1) + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(key_value_states, queries, attn_mask=attn_mask) + + out = self.feed_forward(self.layer_norm(attention_out)) + + return out + + +class AriaSharedExpertsMLP(nn.Module): + """ + Shared Expert MLP for shared experts. + + Unlike routed experts, shared experts process all tokens without routing. + This class reconfigures the intermediate size in comparison to the LlamaMLP. + + Args: + config (`AriaTextConfig`): Configuration object for the Aria language model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size * config.moe_num_shared_experts + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def sequential_experts_gemm(token_states, expert_weights, tokens_per_expert): + """ + Compute the matrix multiplication (GEMM) for each expert sequentially. This approach is computationally inefficient, especially when dealing with a large number of experts. + + Args: + token_states (torch.Tensor): Input tensor of shape (num_tokens, in_features). + expert_weights (torch.Tensor): Weight tensor of shape (num_experts, in_features, out_features). + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + num_tokens = token_states.shape[0] + out_features = expert_weights.shape[-1] + output = torch.zeros(num_tokens, out_features, dtype=token_states.dtype, device=token_states.device) + + cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) + # Insert zero at the begining for offset index's convenience + zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) + cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) + + for expert_num in range(expert_weights.shape[0]): + start = cumsum_num_tokens[expert_num] + end = cumsum_num_tokens[expert_num + 1] + tokens = token_states[start:end] + + out = torch.matmul(tokens, expert_weights[expert_num]) + output[start:end] = out + return output + + +class AriaGroupedExpertsGemm(nn.Module): + """ + Grouped GEMM (General Matrix Multiplication) module for efficient expert computation. + This module utilizes the grouped_gemm library (https://github.com/fanshiqing/grouped_gemm) + for optimized performance. If the grouped_gemm library is not installed, it gracefully + falls back to a sequential GEMM implementation, which may be slower but ensures + functionality. + + Args: + in_features (`int`): + Number of input features. + out_features (`int`): + Number of output features. + groups (`int`): + Number of expert groups. 
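+
+    Illustrative shapes (example values, not defaults): with `groups=4`, `in_features=16` and
+    `out_features=32`, `weight` has shape (4, 16, 32); `forward` then maps a (num_tokens, 16) input,
+    together with a `tokens_per_expert` histogram summing to `num_tokens`, to a (num_tokens, 32) output.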
+ """ + + def __init__(self, in_features, out_features, groups): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.groups = groups + self.weight = nn.Parameter(torch.empty(groups, in_features, out_features)) + + def forward(self, input, tokens_per_expert): + """ + Perform grouped matrix multiplication. + + Args: + input (`torch.Tensor`): + Input tensor of shape (num_tokens, in_features). + tokens_per_expert (`torch.Tensor`): + Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + return sequential_experts_gemm( + input, + self.weight, + tokens_per_expert.cpu(), + ) + + +class AriaGroupedExpertsMLP(nn.Module): + """ + Grouped MLP module for Mixture of Experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + def __init__(self, config: AriaTextConfig) -> None: + super().__init__() + self.config = config + self.fc1 = AriaGroupedExpertsGemm(config.hidden_size, config.intermediate_size * 2, config.moe_num_experts) + self.fc2 = AriaGroupedExpertsGemm(config.intermediate_size, config.hidden_size, config.moe_num_experts) + + def forward(self, permuted_tokens, tokens_per_expert): + """ + Forward pass of the Grouped MLP. + + Args: + permuted_tokens (torch.Tensor): Permuted input tokens. + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor after passing through the MLP. + """ + fc1_output = self.fc1(permuted_tokens, tokens_per_expert) + projection, gate = torch.chunk(fc1_output, 2, dim=-1) + fc1_output = nn.functional.silu(projection) * gate + fc2_output = self.fc2(fc1_output, tokens_per_expert) + return fc2_output + + +# Token permutation adapted from https://github.com/NVIDIA/Megatron-LM/blob/54f1f78529cbc2b9cddad313e7f9d96ac0420a27/megatron/core/transformer/moe/token_dispatcher.py#L291-L587 +class AriaTextMoELayer(nn.Module): + """ + Aria Text Mixture of Experts (MoE) Layer. + + This layer applies a gating mechanism to route input tokens to different experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__() + + self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False) + self.experts = AriaGroupedExpertsMLP(config) + self.shared_experts = AriaSharedExpertsMLP(config) + self.config = config + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the MoE Layer. + + Args: + hidden_states (`torch.Tensor`): + Input tensor of shape (batch_size, sequence_length, hidden_size). + + Returns: + torch.Tensor: Output tensor after passing through the MoE layer. + + Process: + 1. Route tokens to experts using the router. + 2. Permute tokens based on routing decisions. + 3. Process tokens through experts. + 4. Unpermute and combine expert outputs. + 5. Add shared expert output to the final result. 
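+
+        Illustrative example (hypothetical sizes): with `moe_topk=2`, `moe_num_experts=4` and a
+        (1, 3, hidden_size) input, each of the 3 tokens is routed to its 2 highest-scoring experts,
+        `tokens_per_expert` sums to 3 * 2 = 6, and every token's output is the softmax-weighted sum of
+        its 2 expert outputs plus the shared experts' output.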
+ """ + original_shape = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_states.size(-1)) + + # Top K Routing + logits = self.router(hidden_states) + top_logits, top_indices = torch.topk(logits, k=self.config.moe_topk, dim=1) + scores = nn.functional.softmax(top_logits, dim=-1) + + original_dtype = top_indices.dtype + + tokens_per_expert = torch.histc( + top_indices.flatten().to(torch.float32), + bins=self.config.moe_num_experts, + min=0, + max=self.config.moe_num_experts - 1, + ).to(original_dtype) + indices = top_indices + + # Token permutation + flatten_indices = indices.view(-1) + sorted_indices = torch.argsort(flatten_indices) + permuted_tokens = hidden_states.index_select(0, sorted_indices // self.config.moe_topk) + + # Process through experts + expert_output = self.experts(permuted_tokens, tokens_per_expert) + + # Token unpermutation + unpermuted_tokens = torch.zeros( + (scores.shape[0] * self.config.moe_topk, expert_output.size(1)), + dtype=expert_output.dtype, + device=expert_output.device, + ) + unpermuted_tokens.index_copy_(0, sorted_indices, expert_output) + unpermuted_tokens = unpermuted_tokens.view(-1, self.config.moe_topk, expert_output.size(1)) + + output = (unpermuted_tokens * scores.unsqueeze(-1)).sum(dim=1).view(original_shape) + + # Add shared expert output + shared_expert_output = self.shared_experts(hidden_states.view(original_shape)) + return output + shared_expert_output + + +class AriaTextRotaryEmbedding(nn.Module): + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[AriaTextConfig] = None, + ): + super().__init__() + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`AriaTextRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. 
All other arguments will be removed in v4.46" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len + + @torch.no_grad() + def forward(self, x, position_ids): + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + + # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. 
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class AriaTextAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: AriaTextConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads) + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + + # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers) + self.rotary_emb = AriaTextRotaryEmbedding(config=self.config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." 
+ ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class AriaTextFlashAttention2(AriaTextAttention): + """ + AriaText flash attention module. This module inherits from `AriaTextAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if isinstance(past_key_value, StaticCache): + raise ValueError( + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" + ) + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. 
(AriaTextRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class AriaTextSdpaAttention(AriaTextAttention): + """ + AriaText attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `AriaTextAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from AriaTextAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "AriaTextModel is using AriaTextSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +ARIA_TEXT_ATTENTION_CLASSES = { + "eager": AriaTextAttention, + "flash_attention_2": AriaTextFlashAttention2, + "sdpa": AriaTextSdpaAttention, +} + + +class AriaTextDecoderLayer(nn.Module): + """ + Aria Text Decoder Layer. + + This class defines a single decoder layer in the language model, incorporating self-attention and Mixture of Experts (MoE) feed-forward network. 
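+
+    In rough pseudocode (a summary of the `forward` method below, not additional behavior):
+    `hidden_states = hidden_states + self_attn(input_layernorm(hidden_states))`, followed by
+    `hidden_states = hidden_states + mlp(post_attention_layernorm(hidden_states))`, where `mlp` is the
+    [`AriaTextMoELayer`].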
+ + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + layer_idx (`int`): + Index of the layer. + """ + + def __init__(self, config: AriaTextConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = ARIA_TEXT_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + self.mlp = AriaTextMoELayer(config) + self.input_layernorm = AriaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = AriaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class AriaTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
+ """ + + config_class = AriaConfig + base_model_prefix = "model" + _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"] + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = False + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, AriaGroupedExpertsGemm): + module.weight.data.normal_(mean=0.0, std=std) + elif isinstance(module, nn.Conv2d): + module.weight.data.normal_(mean=0.0, std=std) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + +ARIA_TEXT_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AriaTextConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Aria Model outputting raw hidden-states without any specific head on top.", + ARIA_TEXT_START_DOCSTRING, +) +class AriaPreTrainedModel(PreTrainedModel): + config_class = AriaTextConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["AriaDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + _supports_quantized_cache = True + _supports_static_cache = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, AriaProjector): + nn.init.trunc_normal_(module.query, std=std) + + +ARIA_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
+""" + + +@add_start_docstrings( + "The bare AriaText Model outputting raw hidden-states without any specific head on top.", + ARIA_TEXT_START_DOCSTRING, +) +class AriaTextModel(AriaTextPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`AriaTextDecoderLayer`] + + Args: + config: AriaTextConfig + """ + + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [AriaTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = AriaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = AriaTextRotaryEmbedding(config=config) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(ARIA_TEXT_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + position_embeddings, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **flash_attn_kwargs, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
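+        # Concretely: when the mask is trivially causal (no padding to account for, no static cache, and attention
+        # weights are not requested), returning `None` below lets SDPA take its fused `is_causal=True` path instead
+        # of materializing an explicit 4D float mask.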
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
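+            # ("Inverted" here means an additive mask: 0.0 where attention is allowed and a large negative value,
+            # typically `torch.finfo(dtype).min`, where it is not, so it can be added directly to attention scores.)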
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... + + +class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin): + """ + Aria model for causal language modeling tasks. + + This class extends `LlamaForCausalLM` to incorporate the Mixture of Experts (MoE) approach, + allowing for more efficient and scalable language modeling. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + config_class = AriaTextConfig + + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.model = AriaTextModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(ARIA_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + **kwargs: Unpack[KwargsForCausalLM], + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). 
Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, AriaTextForCausalLM + + >>> model = AriaTextForCausalLM.from_pretrained("meta-aria_text/AriaText-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-aria_text/AriaText-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@dataclass +class AriaCausalLMOutputWithPast(ModelOutput): + """ + Base class for Aria causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +ARIA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor`, *optional*): + Input token IDs. + pixel_values (`torch.FloatTensor`, *optional*): + Pixel values of the images. + pixel_mask (`torch.LongTensor`, *optional*): + Mask for the pixel values. + attention_mask (`torch.Tensor`, *optional*): + Attention mask. + position_ids (`torch.LongTensor`, *optional*): + Position IDs. + past_key_values (`List[torch.FloatTensor]`, *optional*): + Past key values for efficient processing. + inputs_embeds (`torch.FloatTensor`, *optional*): + Input embeddings. + labels (`torch.LongTensor`, *optional*): + Labels for computing the language modeling loss. + use_cache (`bool`, *optional*): + Whether to use the model's cache mechanism. + output_attentions (`bool`, *optional*): + Whether to output attention weights. + output_hidden_states (`bool`, *optional*): + Whether to output hidden states. + return_dict (`bool`, *optional*): + Whether to return a `ModelOutput` object. + num_logits_to_keep (`int`, *optional*, defaults to 0): + Calculate logits for the last `num_logits_to_keep` tokens, or all `input_ids` if `0`. + cache_position (`torch.LongTensor`, *optional*): + Cache positions. + **loss_kwargs: + Additional keyword arguments for loss calculation. +""" + +ARIA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config (`AriaConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + """Aria model for conditional generation tasks. + + This model combines a vision tower, a multi-modal projector, and a language model + to perform tasks that involve both image and text inputs.""", + ARIA_START_DOCSTRING, +) +class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): + config_class = AriaConfig + _supports_flash_attn_2 = False + _supports_sdpa = False + + def __init__(self, config: AriaConfig): + super().__init__(config) + + self.vision_tower = AutoModel.from_config(config.vision_config) + self.multi_modal_projector = AriaProjector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2" + self.post_init() + + def _create_patch_attention_mask(self, pixel_mask): + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + patches_subgrid = patches_subgrid.unfold( + dimension=2, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + pixel_mask: torch.FloatTensor = None, + vision_feature_layer: int = -1, + ): + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + image_outputs = self.vision_tower( + pixel_values, patch_attention_mask=patch_attention_mask, output_hidden_states=True + ) + image_attn_mask = None + if patch_attention_mask is not None: + flattened_mask = patch_attention_mask.flatten(1) + image_attn_mask = torch.logical_not(flattened_mask) + + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + image_features = self.multi_modal_projector(selected_image_feature, attn_mask=image_attn_mask) + return image_features + + @add_start_docstrings_to_model_forward(ARIA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AriaCausalLMOutputWithPast, config_class=AriaConfig) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + pixel_mask: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + num_logits_to_keep: int = 0, + cache_position: Optional[torch.LongTensor] = None, + **loss_kwargs, + ) -> 
Union[Tuple, AriaCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`). + Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only + computed for the tokens with labels in `[0, ..., config.vocab_size]`. + Returns: + + Example: + + ```python + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from io import BytesIO + + >>> from transformers import AutoProcessor, AutoModel + >>> from transformers.image_utils import load_image + + >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible + >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg") + >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg") + >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg") + + >>> processor = AutoProcessor.from_pretrained("Rhymes-AI/Aria") + >>> model = AutoModel.from_pretrained("Rhymes-AI/Aria", torch_dtype=torch.bfloat16, device_map="auto") + + >>> # Create inputs + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image"}, + ... {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."}, + ... {"type": "image"}, + ... {"type": "text", "text": "What can we see in this image?"}, + ... ] + ... }, + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image"}, + ... {"type": "text", "text": "In which city is that bridge located?"}, + ... ] + ... } + ... ] + + >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages] + >>> images = [[image1, image2], [image3]] + >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device) + + >>> # Generate + >>> generated_ids = model.generate(**inputs, max_new_tokens=256) + >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True) + + >>> print(generated_texts[0]) + Assistant: There are buildings, trees, lights, and water visible in this image. + + >>> print(generated_texts[1]) + Assistant: The bridge is in San Francisco. + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # 2. 
Merge text and images + if pixel_values is not None and inputs_embeds.shape[1] != 1: + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device) + ) + n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0] + else: + image_embeds = input_ids == self.config.image_token_index + special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0) + image_features = self.get_image_features( + pixel_values=pixel_values, + pixel_mask=pixel_mask, + vision_feature_layer=self.config.vision_feature_layer, + ) + n_images, n_features_per_image = image_features.shape[0], image_features.shape[1] + n_image_features = n_images * n_features_per_image + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + num_logits_to_keep=num_logits_to_keep, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **loss_kwargs + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return AriaCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_mask=None, + attention_mask=None, + cache_position=None, + num_logits_to_keep=None, + **kwargs, + ): + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + num_logits_to_keep=num_logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["pixel_mask"] = pixel_mask + + return model_inputs + + +__all__ = [ + "AriaForConditionalGeneration", + "AriaPreTrainedModel", + "AriaTextPreTrainedModel", + "AriaTextModel", + "AriaTextForCausalLM", +] diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py new file mode 100644 index 00000000000000..78c6e08bdfd0e5 --- /dev/null +++ b/src/transformers/models/aria/modular_aria.py @@ -0,0 +1,1598 @@ +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ...activations import ACT2FN +from ...configuration_utils import PretrainedConfig +from ...generation import GenerationMixin +from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution +from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...modeling_utils import PreTrainedModel +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils import ( + PreTokenizedInput, + TextInput, +) +from ...utils import ( + TensorType, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ...utils.import_utils import is_torch_available +from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer +from ..llama.configuration_llama import LlamaConfig +from ..llama.modeling_llama import ( + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaMLP, + LlamaModel, + LlamaPreTrainedModel, + LlamaRMSNorm, +) +from ..llava.modeling_llava import LlavaCausalLMOutputWithPast +from ..llava_next.image_processing_llava_next import divide_to_patches, make_batched_images + + +logger = logging.get_logger(__name__) + +if is_torch_available(): + import torch + from torch import nn + + +def sequential_experts_gemm(token_states, expert_weights, tokens_per_expert): + """ + Compute the matrix multiplication (GEMM) for each expert sequentially. This approach is computationally inefficient, especially when dealing with a large number of experts. + + Args: + token_states (torch.Tensor): Input tensor of shape (num_tokens, in_features). + expert_weights (torch.Tensor): Weight tensor of shape (num_experts, in_features, out_features). + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + num_tokens = token_states.shape[0] + out_features = expert_weights.shape[-1] + output = torch.zeros(num_tokens, out_features, dtype=token_states.dtype, device=token_states.device) + + cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) + # Insert zero at the begining for offset index's convenience + zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) + cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) + + for expert_num in range(expert_weights.shape[0]): + start = cumsum_num_tokens[expert_num] + end = cumsum_num_tokens[expert_num + 1] + tokens = token_states[start:end] + + out = torch.matmul(tokens, expert_weights[expert_num]) + output[start:end] = out + return output + + +class AriaTextConfig(LlamaConfig): + r""" + This class handles the configuration for the text component of the Aria model. 
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + This class extends the LlamaConfig to include additional parameters specific to the Mixture of Experts (MoE) architecture. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 4096): + The size of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens, + Llama 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 2): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. 
NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + head_dim (`int`, *optional*): + The attention head dimension. If None, it will default to hidden_size // num_heads + moe_num_experts (`int`, *optional*, defaults to 8): + The number of experts in the MoE layer. + moe_topk (`int`, *optional*, defaults to 2): + The number of top experts to route to for each token. + moe_num_shared_experts (`int`, *optional*, defaults to 2): + The number of shared experts. 
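+
+    Example (illustrative; assumes `AriaTextConfig` is importable from the top-level `transformers` package):
+
+    ```python
+    >>> from transformers import AriaTextConfig
+
+    >>> # Initializing a default Aria text configuration (MoE with 8 experts and top-2 routing)
+    >>> configuration = AriaTextConfig()
+    >>> configuration.moe_num_experts
+    8
+
+    >>> # Overriding the MoE-specific parameters
+    >>> configuration = AriaTextConfig(moe_num_experts=16, moe_topk=4, moe_num_shared_experts=2)
+    ```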
+ """ + + model_type = "aria_text" + base_config_key = "text_config" + + def __init__( + self, + intermediate_size: int = 4096, + moe_num_experts: int = 8, + moe_topk: int = 2, + moe_num_shared_experts: int = 2, + pad_token_id=2, + **super_kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **super_kwargs) + self.intermediate_size = intermediate_size + self.moe_num_experts = moe_num_experts + self.moe_topk = moe_topk + self.moe_num_shared_experts = moe_num_shared_experts + + +class AriaConfig(PretrainedConfig): + r""" + This class handles the configuration for both vision and text components of the Aria model, + as well as additional parameters for image token handling and projector mapping. + Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`AriaVisionConfig` or `dict`, *optional*): + Configuration for the vision component. + vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to select the vision feature. + text_config (`AriaTextConfig` or `dict`, *optional*): + Configuration for the text component. + projector_patch_to_query_dict (`dict`, *optional*): + Mapping of patch sizes to query dimensions. + image_token_index (`int`, *optional*, defaults to 9): + Index used to represent image tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal initializer for initializing all weight matrices. + + Attributes: + model_type (`str`): + Type of the model, set to `"aria"`. + image_token_index (`int`): + Index used to represent image tokens. + projector_patch_to_query_dict (`dict`): + Mapping of patch sizes to query dimensions. + vision_config (`AriaVisionConfig`): + Configuration for the vision component. + text_config (`AriaTextConfig`): + Configuration for the text component. 
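+
+    Example (illustrative; assumes `AriaConfig` and `AriaTextConfig` are importable from the top-level
+    `transformers` package):
+
+    ```python
+    >>> from transformers import AriaConfig, AriaTextConfig
+
+    >>> # A default configuration: the vision side falls back to the default `idefics3_vision` configuration
+    >>> configuration = AriaConfig()
+    >>> configuration.model_type
+    'aria'
+
+    >>> # Building a configuration from an explicit text sub-configuration
+    >>> configuration = AriaConfig(text_config=AriaTextConfig(moe_num_experts=16), image_token_index=9)
+    ```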
+ """ + + model_type = "aria" + sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + vision_feature_layer: int = -1, + text_config: AriaTextConfig = None, + projector_patch_to_query_dict: Dict = None, + image_token_index: int = 9, + initializer_range: float = 0.02, + **kwargs, + ): + self.image_token_index = image_token_index + + # Convert the keys and values of projector_patch_to_query_dict to integers + # This ensures consistency even if they were provided as strings + if projector_patch_to_query_dict is None: + projector_patch_to_query_dict = { + 1225: 128, + 4900: 256, + } + self.projector_patch_to_query_dict = {int(k): int(v) for k, v in projector_patch_to_query_dict.items()} + self.max_value_projector_patch_to_query_dict = max(self.projector_patch_to_query_dict.values()) + self.vision_feature_layer = vision_feature_layer + if isinstance(vision_config, dict): + vision_config["model_type"] = "idefics3_vision" + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["idefics3_vision"]() + + self.vision_config = vision_config + self.initializer_range = initializer_range + + if isinstance(text_config, dict) and "model_type" in text_config: + text_config = AriaTextConfig(**text_config) + elif text_config is None: + text_config = AriaTextConfig() + + self.text_config = text_config + + super().__init__(**kwargs) + + +class AriaTextRMSNorm(LlamaRMSNorm): + pass + + +class AriaProjectorMLP(nn.Module): + """ + Feed-Forward Network module for the Aria Projector. + + Args: + in_features (`int`): + Input embedding dimension. + hidden_features (`int`): + Hidden dimension of the feed-forward network. + output_dim (`int`): + Output dimension. + """ + + def __init__(self, in_features, hidden_features, output_dim): + super().__init__() + self.linear_in = nn.Linear(in_features, hidden_features, bias=False) + self.linear_out = nn.Linear(hidden_features, output_dim, bias=False) + self.act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_states = self.act(self.linear_in(hidden_states)) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + +class AriaCrossAttention(nn.Module): + """ + Aria Cross-Attention module. + + Args: + config (`AriaConfig`): + The configuration to use. + """ + + def __init__(self, config: AriaConfig, dropout_rate: float = 0): + super().__init__() + hidden_size = config.vision_config.hidden_size + num_heads = config.vision_config.num_attention_heads + self.num_heads = num_heads + self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False) + + # Original code here: https://github.com/rhymes-ai/Aria/blob/719ff4e52b727443cba3793b0e27fe64e0244fe1/aria/model/projector.py#L48 + self.multihead_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True) + self.linear = nn.Linear(hidden_size, hidden_size) + self.dropout = nn.Dropout(dropout_rate) + + self.layer_norm = nn.LayerNorm(hidden_size) + self.layer_norm_kv = nn.LayerNorm(hidden_size) + + def forward(self, key_value_states, hidden_states, attn_mask=None): + """ + Forward pass of the AriaCrossAttention module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor for key and value. + hidden_states (`torch.Tensor`): + Input tensor for query. 
+ attn_mask (`torch.Tensor`, *optional*, defaults to None): + Attention mask. + + Returns: + torch.Tensor: + Output tensor after cross-attention. + """ + query = self.q_proj(self.layer_norm(hidden_states)) + + key_value_states = self.layer_norm_kv(key_value_states) + key = self.k_proj(key_value_states) + value = self.v_proj(key_value_states) + + attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) + + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + Aria Projector module. + + This module projects vision features into the language model's embedding space, enabling interaction between vision and language components. + + Args: + config (`AriaConfig`): + Configuration object for the model. + """ + + def __init__( + self, + config: AriaConfig, + ): + super().__init__() + + self.patch_to_query_dict = config.projector_patch_to_query_dict + self.in_features = config.vision_config.hidden_size + self.num_heads = config.vision_config.num_attention_heads + self.kv_dim = config.vision_config.hidden_size + self.hidden_features = config.text_config.hidden_size + self.output_dim = config.text_config.hidden_size + + self.query = nn.Parameter(torch.zeros(config.max_value_projector_patch_to_query_dict, self.in_features)) + + self.cross_attn = AriaCrossAttention(config) + + self.layer_norm = nn.LayerNorm(self.in_features) + self.feed_forward = AriaProjectorMLP(self.in_features, self.hidden_features, self.output_dim) + + def forward(self, key_value_states: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): + """ + Forward pass of the Projector module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor of shape (batch_size, num_patches, kv_dim). + attn_mask (`torch.Tensor`, *optional*, default is None): + Attention mask. + + Returns: + `torch.Tensor`: Output tensor of shape (batch_size, query_number, output_dim). + """ + batch_size, num_patches = key_value_states.shape[0], key_value_states.shape[1] + + if num_patches not in self.patch_to_query_dict.keys(): + raise KeyError( + f"Number of patches {num_patches} not found in patch_to_query_dict amongst possible values {self.patch_to_query_dict.keys()}." + ) + query_num = self.patch_to_query_dict[num_patches] + + queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1) + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(key_value_states, queries, attn_mask=attn_mask) + + out = self.feed_forward(self.layer_norm(attention_out)) + + return out + + +def _get_patch_output_size(image, target_resolution, input_data_format): + original_height, original_width = get_image_size(image, channel_dim=input_data_format) + target_height, target_width = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + return new_height, new_width + + +class AriaImageProcessor(BaseImageProcessor): + """ + A vision processor for the Aria model that handles image preprocessing. + Initialize the AriaImageProcessor. + + Args: + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. 
+ image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to 980): + Maximum image size. + min_image_size (`int`, *optional*, defaults to 336): + Minimum image size. + split_resolutions (`list`, *optional*, defaults to a list of optimal,resolutions as tuples): + The optimal resolutions for splitting the image. + split_image (`bool`, *optional*, defaults to `False`): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + resample (PILImageResampling, *optional*, defaults to `BICUBIC`): + The resampling filter to use if resizing the image. + """ + + def __init__( + self, + image_mean: List[float] = None, + image_std: List[float] = None, + max_image_size: int = 980, + min_image_size: int = 336, + split_resolutions: Optional[List[Tuple[int, int]]] = None, + split_image: Optional[bool] = False, + do_convert_rgb: Optional[bool] = True, + do_normalize: Optional[bool] = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ): + super().__init__(**kwargs) + + if image_mean is None: + image_mean = [0.5, 0.5, 0.5] + if image_std is None: + image_std = [0.5, 0.5, 0.5] + self.max_image_size = max_image_size + self.min_image_size = min_image_size + self.image_mean = image_mean + self.image_std = image_std + self.split_image = split_image + if split_resolutions is None: + split_resolutions = [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (2, 4), (2, 3), (2, 2), (2, 1), (3, 1), (3, 2), (4, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1)] # fmt: skip + split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions] + self.split_resolutions = split_resolutions + self.do_convert_rgb = do_convert_rgb + self.do_normalize = do_normalize + self.resample = resample + + def preprocess( + self, + images: Union[ImageInput, List[ImageInput]], + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + max_image_size: Optional[int] = None, + min_image_size: Optional[int] = None, + split_image: Optional[bool] = None, + do_convert_rgb: Optional[bool] = None, + do_normalize: Optional[bool] = None, + resample: PILImageResampling = None, + return_tensors: Optional[Union[str, TensorType]] = "pt", + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Process a list of images. + + Args: + images (ImageInput or list of ImageInput): + The input image or a list of images. + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. + image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to `self.max_image_size` (980)): + Maximum image size. + min_image_size (`int`, *optional*, defaults to `self.min_image_size` (336)): + Minimum image size. + split_image (`bool`, *optional*, defaults to `self.split_image` (False)): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)): + Whether to convert the image to RGB. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)): + Whether to normalize the image. 
+ resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)): + The resampling filter to use if resizing the image. + return_tensors (`str` or `TensorType`, *optional*, defaults to "pt"): + The type of tensor to return. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + BatchFeature: + A BatchFeature object containing: + - 'pixel_values': + Tensor of processed image pixel values. + - 'pixel_mask': + Boolean pixel mask. This mask is a 2D tensor of shape (max_image_size, max_image_size) where: + - True (1) values indicate pixels that belong to the original resized image. + - False (0) values indicate pixels that are part of the padding. + The mask helps distinguish between actual image content and padded areas in subsequent processing steps. + - 'num_crops': + The maximum number of crops across all images. + """ + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + max_image_size = max_image_size if max_image_size is not None else self.max_image_size + min_image_size = min_image_size if min_image_size is not None else self.min_image_size + split_image = split_image if split_image is not None else self.split_image + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + + if max_image_size not in [490, 980]: + raise ValueError("max_image_size must be either 490 or 980") + + images = make_batched_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
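+            # The format inferred from the first image (channels-first vs. channels-last) is therefore reused for
+            # every image and crop processed below.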
+ input_data_format = infer_channel_dimension_format(images[0]) + + pixel_values = [] + pixel_masks = [] + num_crops = None + + for image in images: + if split_image: + crop_images = self.get_image_patches( + image, + self.split_resolutions, + max_image_size, + resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + else: + crop_images = [image] + if num_crops is None or len(crop_images) > num_crops: + num_crops = len(crop_images) + + for crop_image in crop_images: + # At this point the scale is the rescaling factor that would bring the image to max_size in its larger dimension + h, w = get_image_size(crop_image) + scale = max_image_size / max(h, w) + if w >= h: + new_size = (max(int(h * scale), min_image_size), max_image_size) # h, w + else: + new_size = (max_image_size, max(int(w * scale), min_image_size)) # h, w + + crop_image_resized = resize( + crop_image, + new_size, + resample=resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + + padding_bottom, padding_right = max_image_size - new_size[0], max_image_size - new_size[1] + crop_image_padded = pad( + crop_image_resized, + ((0, padding_bottom), (0, padding_right)), + data_format=input_data_format, + input_data_format=input_data_format, + ) + + # Create a pixel mask + pixel_mask = np.zeros((max_image_size, max_image_size), dtype=bool) + pixel_mask[: new_size[0], : new_size[1]] = 1 + pixel_masks.append(pixel_mask) + + if do_normalize: + crop_image_padded = self.normalize( + crop_image_padded / 255.0, + self.image_mean, + self.image_std, + data_format=input_data_format, + input_data_format=input_data_format, + ) + crop_image_padded = ( + to_channel_dimension_format(crop_image_padded, data_format, input_data_format) + if data_format is not None + else crop_image_padded + ) + + pixel_values.append(crop_image_padded) + return BatchFeature( + data={ + "pixel_values": np.stack(pixel_values, axis=0), + "pixel_mask": np.stack(pixel_masks, axis=0), + "num_crops": num_crops, + }, + tensor_type=return_tensors, + ) + + def _resize_for_patching( + self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension + ) -> np.array: + """ + Resizes an image to a target resolution while maintaining aspect ratio. + + Args: + image (np.array): + The input image. + target_resolution (tuple): + The target resolution (height, width) of the image. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + np.array: The resized and padded image. + """ + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + # Resize the image + resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) + + return resized_image + + def _pad_for_patching( + self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension + ) -> np.array: + """ + Pad an image to a target resolution while maintaining aspect ratio. 
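+
+        The required padding on each axis is split evenly (integer floor division) between the two sides, so the
+        resized image ends up centered on the target canvas.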
+ """ + target_height, target_width = target_resolution + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + + padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x))) + + return padded_image + + def pad( + self, + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`) + dimension of in the (`num_patches`) dimension. In the second case an iterable if tuples is expected + as input. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. 
+ + """ + + # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim + if isinstance(padding, int) or len(padding) != 4: + return pad(image, padding, mode, constant_values, data_format, input_data_format) + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + padding_mode_mapping = { + PaddingMode.CONSTANT: "constant", + PaddingMode.REFLECT: "reflect", + PaddingMode.REPLICATE: "edge", + PaddingMode.SYMMETRIC: "symmetric", + } + image = np.pad(image, padding, mode=padding_mode_mapping[mode], constant_values=constant_values) + image = ( + to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image + ) + return image + + def get_image_patches( + self, + image: np.array, + grid_pinpoints: List[Tuple[int, int]], + patch_size: int, + resample: PILImageResampling, + data_format: ChannelDimension, + input_data_format: ChannelDimension, + ) -> List[np.array]: + """ + Process an image with variable resolutions by dividing it into patches. + + Args: + image (`np.array`): + The input image to be processed. + grid_pinpoints (List[Tuple[int, int]]): + A list of possible resolutions as tuples. + patch_size (`int`): + Size of the patches to divide the image into. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + data_format (`ChannelDimension` or `str`): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + `List[np.array]`: A list of NumPy arrays containing the processed image patches. + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints must be a list of possible resolutions.") + + possible_resolutions = grid_pinpoints + + image_size = get_image_size(image, channel_dim=input_data_format) + best_resolution = select_best_resolution(image_size, possible_resolutions) + resized_image = self._resize_for_patching( + image, best_resolution, resample=resample, input_data_format=input_data_format + ) + padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format) + + patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format) + + # make sure that all patches are in the input data format + patches = [ + to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format) + for patch in patches + ] + return patches + + +class AriaProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": { + "max_image_size": 980, + "split_image": False, + }, + "return_tensors": TensorType.PYTORCH, + } + + +class AriaProcessor(ProcessorMixin): + """ + AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer. + + Args: + image_processor (`AriaImageProcessor`, *optional*): + The AriaImageProcessor to use for image preprocessing. + tokenizer (`PreTrainedTokenizerBase`, *optional*): + An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. + size_conversion (`Dict`, *optional*): + A dictionary indicating size conversions for images. 
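`get_image_patches` above ultimately hands the padded canvas to `divide_to_patches`; a hypothetical re-implementation of that tiling for a channels-last array shows what the returned list contains (the real helper also handles channels-first inputs).

```python
import numpy as np

def tile_into_patches(image: np.ndarray, patch_size: int) -> list:
    """Sketch: cut a (H, W, C) canvas into non-overlapping patch_size x patch_size tiles, row-major."""
    height, width = image.shape[:2]
    patches = []
    for top in range(0, height, patch_size):
        for left in range(0, width, patch_size):
            patches.append(image[top : top + patch_size, left : left + patch_size])
    return patches

canvas = np.zeros((980, 490, 3), dtype=np.uint8)
print(len(tile_into_patches(canvas, 490)))  # 2 tiles for a 980x490 best resolution
```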
+ """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "size_conversion"] + image_processor_class = "AriaImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer: Union[AutoTokenizer, str] = None, + chat_template: Optional[str] = None, + size_conversion: Optional[Dict[Union[float, int], int]] = None, + ): + if size_conversion is None: + size_conversion = {490: 128, 980: 256} + self.size_conversion = {int(k): v for k, v in size_conversion.items()} + + if tokenizer is not None and tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.unk_token + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[AriaProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). + + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + AriaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + if images is not None: + image_inputs = self.image_processor( + images, + **output_kwargs["images_kwargs"], + ) + # expand the image_token according to the num_crops and tokens per image + tokens_per_image = self.size_conversion[image_inputs.pixel_values.shape[2]] + prompt_strings = [] + num_crops = image_inputs.pop("num_crops") * tokens_per_image + for sample in text: + sample = sample.replace(self.tokenizer.image_token, self.tokenizer.image_token * num_crops) + prompt_strings.append(sample) + + else: + image_inputs = {} + prompt_strings = text + + text_inputs = self.tokenizer( + prompt_strings, + **output_kwargs["text_kwargs"], + ) + + return BatchFeature(data={**text_inputs, **image_inputs}) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +class AriaSharedExpertsMLP(LlamaMLP): + """ + Shared Expert MLP for shared experts. + + Unlike routed experts, shared experts process all tokens without routing. + This class reconfigures the intermediate size in comparison to the LlamaMLP. + + Args: + config (`AriaTextConfig`): Configuration object for the Aria language model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__(self) + self.intermediate_size = config.intermediate_size * config.moe_num_shared_experts + + +class AriaGroupedExpertsGemm(nn.Module): + """ + Grouped GEMM (General Matrix Multiplication) module for efficient expert computation. + This module utilizes the grouped_gemm library (https://github.com/fanshiqing/grouped_gemm) + for optimized performance. If the grouped_gemm library is not installed, it gracefully + falls back to a sequential GEMM implementation, which may be slower but ensures + functionality. + + Args: + in_features (`int`): + Number of input features. + out_features (`int`): + Number of output features. + groups (`int`): + Number of expert groups. + """ + + def __init__(self, in_features, out_features, groups): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.groups = groups + self.weight = nn.Parameter(torch.empty(groups, in_features, out_features)) + + def forward(self, input, tokens_per_expert): + """ + Perform grouped matrix multiplication. + + Args: + input (`torch.Tensor`): + Input tensor of shape (num_tokens, in_features). + tokens_per_expert (`torch.Tensor`): + Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + return sequential_experts_gemm( + input, + self.weight, + tokens_per_expert.cpu(), + ) + + +class AriaGroupedExpertsMLP(nn.Module): + """ + Grouped MLP module for Mixture of Experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + def __init__(self, config: AriaTextConfig) -> None: + super().__init__() + self.config = config + self.fc1 = AriaGroupedExpertsGemm(config.hidden_size, config.intermediate_size * 2, config.moe_num_experts) + self.fc2 = AriaGroupedExpertsGemm(config.intermediate_size, config.hidden_size, config.moe_num_experts) + + def forward(self, permuted_tokens, tokens_per_expert): + """ + Forward pass of the Grouped MLP. + + Args: + permuted_tokens (torch.Tensor): Permuted input tokens. + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor after passing through the MLP. 
+ """ + fc1_output = self.fc1(permuted_tokens, tokens_per_expert) + projection, gate = torch.chunk(fc1_output, 2, dim=-1) + fc1_output = nn.functional.silu(projection) * gate + fc2_output = self.fc2(fc1_output, tokens_per_expert) + return fc2_output + + +# Token permutation adapted from https://github.com/NVIDIA/Megatron-LM/blob/54f1f78529cbc2b9cddad313e7f9d96ac0420a27/megatron/core/transformer/moe/token_dispatcher.py#L291-L587 +class AriaTextMoELayer(nn.Module): + """ + Aria Text Mixture of Experts (MoE) Layer. + + This layer applies a gating mechanism to route input tokens to different experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__() + + self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False) + self.experts = AriaGroupedExpertsMLP(config) + self.shared_experts = AriaSharedExpertsMLP(config) + self.config = config + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the MoE Layer. + + Args: + hidden_states (`torch.Tensor`): + Input tensor of shape (batch_size, sequence_length, hidden_size). + + Returns: + torch.Tensor: Output tensor after passing through the MoE layer. + + Process: + 1. Route tokens to experts using the router. + 2. Permute tokens based on routing decisions. + 3. Process tokens through experts. + 4. Unpermute and combine expert outputs. + 5. Add shared expert output to the final result. + """ + original_shape = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_states.size(-1)) + + # Top K Routing + logits = self.router(hidden_states) + top_logits, top_indices = torch.topk(logits, k=self.config.moe_topk, dim=1) + scores = nn.functional.softmax(top_logits, dim=-1) + + original_dtype = top_indices.dtype + + tokens_per_expert = torch.histc( + top_indices.flatten().to(torch.float32), + bins=self.config.moe_num_experts, + min=0, + max=self.config.moe_num_experts - 1, + ).to(original_dtype) + indices = top_indices + + # Token permutation + flatten_indices = indices.view(-1) + sorted_indices = torch.argsort(flatten_indices) + permuted_tokens = hidden_states.index_select(0, sorted_indices // self.config.moe_topk) + + # Process through experts + expert_output = self.experts(permuted_tokens, tokens_per_expert) + + # Token unpermutation + unpermuted_tokens = torch.zeros( + (scores.shape[0] * self.config.moe_topk, expert_output.size(1)), + dtype=expert_output.dtype, + device=expert_output.device, + ) + unpermuted_tokens.index_copy_(0, sorted_indices, expert_output) + unpermuted_tokens = unpermuted_tokens.view(-1, self.config.moe_topk, expert_output.size(1)) + + output = (unpermuted_tokens * scores.unsqueeze(-1)).sum(dim=1).view(original_shape) + + # Add shared expert output + shared_expert_output = self.shared_experts(hidden_states.view(original_shape)) + return output + shared_expert_output + + +class AriaTextDecoderLayer(LlamaDecoderLayer): + """ + Aria Text Decoder Layer. + + This class defines a single decoder layer in the language model, incorporating self-attention and Mixture of Experts (MoE) feed-forward network. + + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + layer_idx (`int`): + Index of the layer. 
+ """ + + def __init__(self, config: AriaTextConfig, layer_idx: int): + super().__init__(self) + self.mlp = AriaTextMoELayer(config) + + +class AriaTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. + """ + + config_class = AriaConfig + base_model_prefix = "model" + _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"] + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = False + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, AriaGroupedExpertsGemm): + module.weight.data.normal_(mean=0.0, std=std) + elif isinstance(module, nn.Conv2d): + module.weight.data.normal_(mean=0.0, std=std) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + +class AriaPreTrainedModel(LlamaPreTrainedModel): + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, AriaProjector): + nn.init.trunc_normal_(module.query, std=std) + + +class AriaTextModel(LlamaModel): + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.layers = nn.ModuleList( + [AriaTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + self.post_init() + + +class AriaTextForCausalLM(AriaTextPreTrainedModel, LlamaForCausalLM): + """ + Aria model for causal language modeling tasks. + + This class extends `LlamaForCausalLM` to incorporate the Mixture of Experts (MoE) approach, + allowing for more efficient and scalable language modeling. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + _tied_weights_keys = ["lm_head.weight"] + config_class = AriaTextConfig + + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.model = AriaTextModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + +class AriaCausalLMOutputWithPast(LlavaCausalLMOutputWithPast): + pass + + +ARIA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor`, *optional*): + Input token IDs. + pixel_values (`torch.FloatTensor`, *optional*): + Pixel values of the images. + pixel_mask (`torch.LongTensor`, *optional*): + Mask for the pixel values. + attention_mask (`torch.Tensor`, *optional*): + Attention mask. + position_ids (`torch.LongTensor`, *optional*): + Position IDs. + past_key_values (`List[torch.FloatTensor]`, *optional*): + Past key values for efficient processing. + inputs_embeds (`torch.FloatTensor`, *optional*): + Input embeddings. 
+ labels (`torch.LongTensor`, *optional*): + Labels for computing the language modeling loss. + use_cache (`bool`, *optional*): + Whether to use the model's cache mechanism. + output_attentions (`bool`, *optional*): + Whether to output attention weights. + output_hidden_states (`bool`, *optional*): + Whether to output hidden states. + return_dict (`bool`, *optional*): + Whether to return a `ModelOutput` object. + num_logits_to_keep (`int`, *optional*, defaults to 0): + Calculate logits for the last `num_logits_to_keep` tokens, or all `input_ids` if `0`. + cache_position (`torch.LongTensor`, *optional*): + Cache positions. + **loss_kwargs: + Additional keyword arguments for loss calculation. +""" + +ARIA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config (`AriaConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + """Aria model for conditional generation tasks. + + This model combines a vision tower, a multi-modal projector, and a language model + to perform tasks that involve both image and text inputs.""", + ARIA_START_DOCSTRING, +) +class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): + config_class = AriaConfig + _supports_flash_attn_2 = False + _supports_sdpa = False + + def __init__(self, config: AriaConfig): + super().__init__(config) + + self.vision_tower = AutoModel.from_config(config.vision_config) + self.multi_modal_projector = AriaProjector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2" + self.post_init() + + def _create_patch_attention_mask(self, pixel_mask): + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + patches_subgrid = patches_subgrid.unfold( + dimension=2, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def 
get_image_features( + self, + pixel_values: torch.FloatTensor, + pixel_mask: torch.FloatTensor = None, + vision_feature_layer: int = -1, + ): + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + image_outputs = self.vision_tower( + pixel_values, patch_attention_mask=patch_attention_mask, output_hidden_states=True + ) + image_attn_mask = None + if patch_attention_mask is not None: + flattened_mask = patch_attention_mask.flatten(1) + image_attn_mask = torch.logical_not(flattened_mask) + + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + image_features = self.multi_modal_projector(selected_image_feature, attn_mask=image_attn_mask) + return image_features + + @add_start_docstrings_to_model_forward(ARIA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AriaCausalLMOutputWithPast, config_class=AriaConfig) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + pixel_mask: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + num_logits_to_keep: int = 0, + cache_position: Optional[torch.LongTensor] = None, + **loss_kwargs, + ) -> Union[Tuple, AriaCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`). + Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only + computed for the tokens with labels in `[0, ..., config.vocab_size]`. + Returns: + + Example: + + ```python + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from io import BytesIO + + >>> from transformers import AutoProcessor, AutoModel + >>> from transformers.image_utils import load_image + + >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible + >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg") + >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg") + >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg") + + >>> processor = AutoProcessor.from_pretrained("Rhymes-AI/Aria") + >>> model = AutoModel.from_pretrained("Rhymes-AI/Aria", torch_dtype=torch.bfloat16, device_map="auto") + + >>> # Create inputs + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image"}, + ... {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."}, + ... {"type": "image"}, + ... {"type": "text", "text": "What can we see in this image?"}, + ... ] + ... }, + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image"}, + ... {"type": "text", "text": "In which city is that bridge located?"}, + ... ] + ... } + ... 
] + + >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages] + >>> images = [[image1, image2], [image3]] + >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device) + + >>> # Generate + >>> generated_ids = model.generate(**inputs, max_new_tokens=256) + >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True) + + >>> print(generated_texts[0]) + Assistant: There are buildings, trees, lights, and water visible in this image. + + >>> print(generated_texts[1]) + Assistant: The bridge is in San Francisco. + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # 2. Merge text and images + if pixel_values is not None and inputs_embeds.shape[1] != 1: + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device) + ) + n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0] + else: + image_embeds = input_ids == self.config.image_token_index + special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0) + image_features = self.get_image_features( + pixel_values=pixel_values, + pixel_mask=pixel_mask, + vision_feature_layer=self.config.vision_feature_layer, + ) + n_images, n_features_per_image = image_features.shape[0], image_features.shape[1] + n_image_features = n_images * n_features_per_image + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + num_logits_to_keep=num_logits_to_keep, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **loss_kwargs + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return AriaCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_mask=None, + attention_mask=None, + cache_position=None, + num_logits_to_keep=None, + **kwargs, + ): + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + 
num_logits_to_keep=num_logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["pixel_mask"] = pixel_mask + + return model_inputs + + +__all__ = [ + "AriaConfig", + "AriaTextConfig", + "AriaImageProcessor", + "AriaProcessor", + "AriaForConditionalGeneration", + "AriaPreTrainedModel", + "AriaTextPreTrainedModel", + "AriaTextModel", + "AriaTextForCausalLM", +] diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py new file mode 100644 index 00000000000000..2cfbd72a002061 --- /dev/null +++ b/src/transformers/models/aria/processing_aria.py @@ -0,0 +1,164 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Optional, Union + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils import PreTokenizedInput, TextInput +from ...utils import TensorType +from ..auto import AutoTokenizer + + +class AriaProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": { + "max_image_size": 980, + "split_image": False, + }, + "return_tensors": TensorType.PYTORCH, + } + + +class AriaProcessor(ProcessorMixin): + """ + AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer. + + Args: + image_processor (`AriaImageProcessor`, *optional*): + The AriaImageProcessor to use for image preprocessing. + tokenizer (`PreTrainedTokenizerBase`, *optional*): + An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. + size_conversion (`Dict`, *optional*): + A dictionary indicating size conversions for images. 
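The `masked_scatter` merge in `forward` above is the step that swaps image features into the text embedding sequence; a minimal sketch with one feature vector per placeholder token shows the mechanics (in the real model each placeholder is repeated `num_crops * tokens_per_image` times, as the processor section showed, and `9` is an arbitrary stand-in for `config.image_token_index`).

```python
import torch

hidden, image_token_id = 4, 9
input_ids = torch.tensor([[1, 9, 9, 2]])              # one sequence with two image placeholders
inputs_embeds = torch.zeros(1, 4, hidden)             # stand-in text embeddings
image_features = torch.randn(2, hidden)               # one projected feature per placeholder

special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
merged = inputs_embeds.masked_scatter(special_image_mask, image_features)

print(torch.equal(merged[0, 1], image_features[0]))   # True -- first placeholder slot filled
print(torch.equal(merged[0, 3], inputs_embeds[0, 3])) # True -- non-image positions untouched
```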
+ """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "size_conversion"] + image_processor_class = "AriaImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer: Union[AutoTokenizer, str] = None, + chat_template: Optional[str] = None, + size_conversion: Optional[Dict[Union[float, int], int]] = None, + ): + if size_conversion is None: + size_conversion = {490: 128, 980: 256} + self.size_conversion = {int(k): v for k, v in size_conversion.items()} + + if tokenizer is not None and tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.unk_token + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[AriaProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). + + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + AriaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + if images is not None: + image_inputs = self.image_processor( + images, + **output_kwargs["images_kwargs"], + ) + # expand the image_token according to the num_crops and tokens per image + tokens_per_image = self.size_conversion[image_inputs.pixel_values.shape[2]] + prompt_strings = [] + num_crops = image_inputs.pop("num_crops") * tokens_per_image + for sample in text: + sample = sample.replace(self.tokenizer.image_token, self.tokenizer.image_token * num_crops) + prompt_strings.append(sample) + + else: + image_inputs = {} + prompt_strings = text + + text_inputs = self.tokenizer( + prompt_strings, + **output_kwargs["text_kwargs"], + ) + + return BatchFeature(data={**text_inputs, **image_inputs}) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["AriaProcessor"] diff --git a/src/transformers/models/audio_spectrogram_transformer/__init__.py b/src/transformers/models/audio_spectrogram_transformer/__init__.py index 9f1d65e1aac839..3fe10d60c03a92 100644 --- a/src/transformers/models/audio_spectrogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectrogram_transformer/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,47 +13,17 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available - - -_import_structure = { - "configuration_audio_spectrogram_transformer": ["ASTConfig"], - "feature_extraction_audio_spectrogram_transformer": ["ASTFeatureExtractor"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_audio_spectrogram_transformer"] = [ - "ASTForAudioClassification", - "ASTModel", - "ASTPreTrainedModel", - ] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .configuration_audio_spectrogram_transformer import ( - ASTConfig, - ) - from .feature_extraction_audio_spectrogram_transformer import ASTFeatureExtractor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_audio_spectrogram_transformer import ( - ASTForAudioClassification, - ASTModel, - ASTPreTrainedModel, - ) - - + from .configuration_audio_spectrogram_transformer import * + from .convert_audio_spectrogram_transformer_original_to_pytorch import * + from .feature_extraction_audio_spectrogram_transformer import * + from .modeling_audio_spectrogram_transformer import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index 7980667a68d7c5..77bec930236f60 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -126,3 +126,6 @@ def __init__( # generative parameters deprecation cycle, overwriting this function prevents this from happening. 
def _get_non_default_generation_parameters(self) -> Dict[str, Any]: return {} + + +__all__ = ["ASTConfig"] diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index 2bd122b4098c36..b181afe19e9ef8 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -234,3 +234,6 @@ def __call__( padded_inputs = padded_inputs.convert_to_tensors(return_tensors) return padded_inputs + + +__all__ = ["ASTFeatureExtractor"] diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 491c6ce164611a..a9fe0d75f5c380 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -670,3 +670,6 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +__all__ = ["ASTForAudioClassification", "ASTModel", "ASTPreTrainedModel"] diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 7f0182b50085c5..cc3a7d5baaeb49 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -35,6 +35,8 @@ ("albert", "AlbertConfig"), ("align", "AlignConfig"), ("altclip", "AltCLIPConfig"), + ("aria", "AriaConfig"), + ("aria_text", "AriaTextConfig"), ("audio-spectrogram-transformer", "ASTConfig"), ("autoformer", "AutoformerConfig"), ("bark", "BarkConfig"), @@ -135,6 +137,8 @@ ("idefics", "IdeficsConfig"), ("idefics2", "Idefics2Config"), ("idefics3", "Idefics3Config"), + ("idefics3_vision", "Idefics3VisionConfig"), + ("ijepa", "IJepaConfig"), ("imagegpt", "ImageGPTConfig"), ("informer", "InformerConfig"), ("instructblip", "InstructBlipConfig"), @@ -195,7 +199,7 @@ ("nougat", "VisionEncoderDecoderConfig"), ("nystromformer", "NystromformerConfig"), ("olmo", "OlmoConfig"), - ("olmo_1124", "Olmo1124Config"), + ("olmo2", "Olmo2Config"), ("olmoe", "OlmoeConfig"), ("omdet-turbo", "OmDetTurboConfig"), ("oneformer", "OneFormerConfig"), @@ -326,6 +330,8 @@ ("albert", "ALBERT"), ("align", "ALIGN"), ("altclip", "AltCLIP"), + ("aria", "Aria"), + ("aria_text", "AriaText"), ("audio-spectrogram-transformer", "Audio Spectrogram Transformer"), ("autoformer", "Autoformer"), ("bark", "Bark"), @@ -440,6 +446,8 @@ ("idefics", "IDEFICS"), ("idefics2", "Idefics2"), ("idefics3", "Idefics3"), + ("idefics3_vision", "Idefics3VisionTransformer"), + ("ijepa", "I-JEPA"), ("imagegpt", "ImageGPT"), ("informer", "Informer"), ("instructblip", "InstructBLIP"), @@ -511,7 +519,7 @@ ("nougat", "Nougat"), ("nystromformer", "Nyströmformer"), ("olmo", "OLMo"), - ("olmo_1124", "OLMo November 2024"), + ("olmo2", "OLMo2"), ("olmoe", "OLMoE"), ("omdet-turbo", "OmDet-Turbo"), ("oneformer", "OneFormer"), @@ -685,6 +693,8 @@ ("clip_vision_model", "clip"), ("qwen2_audio_encoder", "qwen2_audio"), ("clip_text_model", "clip"), + ("aria_text", "aria"), + ("idefics3_vision", "idefics3"), ("siglip_vision_model", "siglip"), ("chinese_clip_vision_model", "chinese_clip"), ("rt_detr_resnet", 
"rt_detr"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index a8960d80acc838..a699314f858928 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -54,6 +54,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( [ ("align", ("EfficientNetImageProcessor",)), + ("aria", ("AriaImageProcessor")), ("beit", ("BeitImageProcessor",)), ("bit", ("BitImageProcessor",)), ("blip", ("BlipImageProcessor",)), @@ -68,7 +69,7 @@ ("convnextv2", ("ConvNextImageProcessor",)), ("cvt", ("ConvNextImageProcessor",)), ("data2vec-vision", ("BeitImageProcessor",)), - ("deformable_detr", ("DeformableDetrImageProcessor",)), + ("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")), ("deit", ("DeiTImageProcessor",)), ("depth_anything", ("DPTImageProcessor",)), ("deta", ("DetaImageProcessor",)), @@ -90,6 +91,7 @@ ("idefics", ("IdeficsImageProcessor",)), ("idefics2", ("Idefics2ImageProcessor",)), ("idefics3", ("Idefics3ImageProcessor",)), + ("ijepa", ("ViTImageProcessor", "ViTImageProcessorFast")), ("imagegpt", ("ImageGPTImageProcessor",)), ("instructblip", ("BlipImageProcessor",)), ("instructblipvideo", ("InstructBlipVideoImageProcessor",)), @@ -117,7 +119,7 @@ ("paligemma", ("SiglipImageProcessor",)), ("perceiver", ("PerceiverImageProcessor",)), ("pix2struct", ("Pix2StructImageProcessor",)), - ("pixtral", ("PixtralImageProcessor",)), + ("pixtral", ("PixtralImageProcessor", "PixtralImageProcessorFast")), ("poolformer", ("PoolFormerImageProcessor",)), ("pvt", ("PvtImageProcessor",)), ("pvt_v2", ("PvtImageProcessor",)), @@ -433,7 +435,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): if image_processor_class is None and image_processor_auto_map is None: if not isinstance(config, PretrainedConfig): config = AutoConfig.from_pretrained( - pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **kwargs, ) # It could be in `config.image_processor_type`` image_processor_class = getattr(config, "image_processor_type", None) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 5206972b72efde..e8e4814e6a0f6a 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -35,6 +35,8 @@ ("albert", "AlbertModel"), ("align", "AlignModel"), ("altclip", "AltCLIPModel"), + ("aria", "AriaForConditionalGeneration"), + ("aria_text", "AriaTextModel"), ("audio-spectrogram-transformer", "ASTModel"), ("autoformer", "AutoformerModel"), ("bark", "BarkModel"), @@ -132,6 +134,8 @@ ("idefics", "IdeficsModel"), ("idefics2", "Idefics2Model"), ("idefics3", "Idefics3Model"), + ("idefics3_vision", "Idefics3VisionTransformer"), + ("ijepa", "IJepaModel"), ("imagegpt", "ImageGPTModel"), ("informer", "InformerModel"), ("jamba", "JambaModel"), @@ -184,7 +188,7 @@ ("nllb-moe", "NllbMoeModel"), ("nystromformer", "NystromformerModel"), ("olmo", "OlmoModel"), - ("olmo_1124", "Olmo1124Model"), + ("olmo2", "Olmo2Model"), ("olmoe", "OlmoeModel"), ("omdet-turbo", "OmDetTurboForObjectDetection"), ("oneformer", "OneFormerModel"), @@ -463,6 +467,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( [ # Model for Causal LM mapping + ("aria_text", "AriaTextForCausalLM"), ("bart", "BartForCausalLM"), ("bert", "BertLMHeadModel"), ("bert-generation", 
"BertGenerationDecoder"), @@ -517,7 +522,7 @@ ("mvp", "MvpForCausalLM"), ("nemotron", "NemotronForCausalLM"), ("olmo", "OlmoForCausalLM"), - ("olmo_1124", "Olmo1124ForCausalLM"), + ("olmo2", "Olmo2ForCausalLM"), ("olmoe", "OlmoeForCausalLM"), ("open-llama", "OpenLlamaForCausalLM"), ("openai-gpt", "OpenAIGPTLMHeadModel"), @@ -578,6 +583,7 @@ ("focalnet", "FocalNetModel"), ("glpn", "GLPNModel"), ("hiera", "HieraModel"), + ("ijepa", "IJepaModel"), ("imagegpt", "ImageGPTModel"), ("levit", "LevitModel"), ("mllama", "MllamaVisionModel"), @@ -655,6 +661,7 @@ ("efficientnet", "EfficientNetForImageClassification"), ("focalnet", "FocalNetForImageClassification"), ("hiera", "HieraForImageClassification"), + ("ijepa", "IJepaForImageClassification"), ("imagegpt", "ImageGPTForImageClassification"), ( "levit", @@ -765,6 +772,7 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict( [ + ("aria", "AriaForConditionalGeneration"), ("blip", "BlipForConditionalGeneration"), ("blip-2", "Blip2ForConditionalGeneration"), ("chameleon", "ChameleonForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index c1f23bc1cb3f18..3e475b1be211fa 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -47,6 +47,7 @@ [ ("align", "AlignProcessor"), ("altclip", "AltCLIPProcessor"), + ("aria", "AriaProcessor"), ("bark", "BarkProcessor"), ("blip", "BlipProcessor"), ("blip-2", "Blip2Processor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 4ed67df0e84b52..3cc181ac87adc4 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -68,6 +68,7 @@ ), ), ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("bart", ("BartTokenizer", "BartTokenizerFast")), ( @@ -348,7 +349,7 @@ ), ), ("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ("olmo_1124", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("olmo2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), ( "omdet-turbo", diff --git a/src/transformers/models/bark/__init__.py b/src/transformers/models/bark/__init__.py index 4cb1a606cf6567..6c21cf99976a15 100644 --- a/src/transformers/models/bark/__init__.py +++ b/src/transformers/models/bark/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,63 +13,18 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = { - "configuration_bark": [ - "BarkCoarseConfig", - "BarkConfig", - "BarkFineConfig", - "BarkSemanticConfig", - ], - "processing_bark": ["BarkProcessor"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bark"] = [ - "BarkFineModel", - "BarkSemanticModel", - "BarkCoarseModel", - "BarkModel", - "BarkPreTrainedModel", - "BarkCausalModel", - ] - if TYPE_CHECKING: - from .configuration_bark import ( - BarkCoarseConfig, - BarkConfig, - BarkFineConfig, - BarkSemanticConfig, - ) - from .processing_bark import BarkProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bark import ( - BarkCausalModel, - BarkCoarseModel, - BarkFineModel, - BarkModel, - BarkPreTrainedModel, - BarkSemanticModel, - ) - + from .configuration_bark import * + from .convert_suno_to_hf import * + from .generation_configuration_bark import * + from .modeling_bark import * + from .processing_bark import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index a498d1dd19371d..932bad618aa187 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -298,3 +298,6 @@ def from_sub_model_configs( codec_config=codec_config.to_dict(), **kwargs, ) + + +__all__ = ["BarkCoarseConfig", "BarkConfig", "BarkFineConfig", "BarkSemanticConfig"] diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index f1c77367e5beb7..9e225ac9ae15c0 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -1819,3 +1819,13 @@ def _check_and_enable_flash_attn_2( config.coarse_acoustics_config._attn_implementation = config._attn_implementation config.fine_acoustics_config._attn_implementation = config._attn_implementation return config + + +__all__ = [ + "BarkFineModel", + "BarkSemanticModel", + "BarkCoarseModel", + "BarkModel", + "BarkPreTrainedModel", + "BarkCausalModel", +] diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index 53715f3260422c..0bed6ca79f410b 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -285,3 +285,6 @@ def __call__( encoded_text["history_prompt"] = voice_preset return encoded_text + + +__all__ = ["BarkProcessor"] diff --git a/src/transformers/models/bart/__init__.py b/src/transformers/models/bart/__init__.py index d538fbb7d34304..11c3f4863f46a1 100644 --- a/src/transformers/models/bart/__init__.py +++ b/src/transformers/models/bart/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,134 +13,20 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_flax_available, - is_tf_available, - is_tokenizers_available, - is_torch_available, -) +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = { - "configuration_bart": ["BartConfig", "BartOnnxConfig"], - "tokenization_bart": ["BartTokenizer"], -} - -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bart_fast"] = ["BartTokenizerFast"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bart"] = [ - "BartForCausalLM", - "BartForConditionalGeneration", - "BartForQuestionAnswering", - "BartForSequenceClassification", - "BartModel", - "BartPreTrainedModel", - "BartPretrainedModel", - "PretrainedBartModel", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_bart"] = [ - "TFBartForConditionalGeneration", - "TFBartForSequenceClassification", - "TFBartModel", - "TFBartPretrainedModel", - ] - -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_bart"] = [ - "FlaxBartDecoderPreTrainedModel", - "FlaxBartForCausalLM", - "FlaxBartForConditionalGeneration", - "FlaxBartForQuestionAnswering", - "FlaxBartForSequenceClassification", - "FlaxBartModel", - "FlaxBartPreTrainedModel", - ] - if TYPE_CHECKING: - from .configuration_bart import BartConfig, BartOnnxConfig - from .tokenization_bart import BartTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bart_fast import BartTokenizerFast - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bart import ( - BartForCausalLM, - BartForConditionalGeneration, - BartForQuestionAnswering, - BartForSequenceClassification, - BartModel, - BartPreTrainedModel, - BartPretrainedModel, - PretrainedBartModel, - ) - - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_bart import ( - TFBartForConditionalGeneration, - TFBartForSequenceClassification, - TFBartModel, - TFBartPretrainedModel, - ) - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_bart import ( - FlaxBartDecoderPreTrainedModel, - FlaxBartForCausalLM, - FlaxBartForConditionalGeneration, - FlaxBartForQuestionAnswering, - FlaxBartForSequenceClassification, - FlaxBartModel, - FlaxBartPreTrainedModel, - ) - + from .configuration_bart import * + from .convert_bart_original_pytorch_checkpoint_to_pytorch import * + from .modeling_bart import * + from .modeling_flax_bart import * + from .modeling_tf_bart import * + from .tokenization_bart import * + from .tokenization_bart_fast import * else: import 
sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index a3bc7f38653a8a..4ce4316e3c0315 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -400,3 +400,6 @@ def _flatten_past_key_values_(self, flattened_output, name, idx, t): flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_( flattened_output, name, idx, t ) + + +__all__ = ["BartConfig", "BartOnnxConfig"] diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 07c1fa622ea3b6..dd1b69c8127fb8 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -2158,3 +2158,15 @@ def _reorder_cache(past_key_values, beam_idx): tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), ) return reordered_past + + +__all__ = [ + "BartForCausalLM", + "BartForConditionalGeneration", + "BartForQuestionAnswering", + "BartForSequenceClassification", + "BartModel", + "BartPreTrainedModel", + "BartPretrainedModel", + "PretrainedBartModel", +] diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py index 634c256fe7d81d..b346eaa39fc199 100644 --- a/src/transformers/models/bart/modeling_flax_bart.py +++ b/src/transformers/models/bart/modeling_flax_bart.py @@ -1993,3 +1993,14 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): FlaxCausalLMOutputWithCrossAttentions, _CONFIG_FOR_DOC, ) + + +__all__ = [ + "FlaxBartDecoderPreTrainedModel", + "FlaxBartForCausalLM", + "FlaxBartForConditionalGeneration", + "FlaxBartForQuestionAnswering", + "FlaxBartForSequenceClassification", + "FlaxBartModel", + "FlaxBartPreTrainedModel", +] diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 5ebde8cba60c45..7ab9817986e6ad 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -1709,3 +1709,6 @@ def build(self, input_shape=None): if getattr(self, "classification_head", None) is not None: with tf.name_scope(self.classification_head.name): self.classification_head.build(None) + + +__all__ = ["TFBartForConditionalGeneration", "TFBartForSequenceClassification", "TFBartModel", "TFBartPretrainedModel"] diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index 5207b9c92b07ff..4c516cb81be0d2 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -388,3 +388,6 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): text = " " + text return (text, kwargs) + + +__all__ = ["BartTokenizer"] diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index e9fb8497c907b9..4586ab4797e5ec 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ 
-274,3 +274,6 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + +__all__ = ["BartTokenizerFast"] diff --git a/src/transformers/models/barthez/__init__.py b/src/transformers/models/barthez/__init__.py index 084cd22bdf1d88..323fe2fe8af982 100644 --- a/src/transformers/models/barthez/__init__.py +++ b/src/transformers/models/barthez/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,49 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available - - -_import_structure = {} - -try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_barthez"] = ["BarthezTokenizer"] - -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_barthez_fast"] = ["BarthezTokenizerFast"] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_barthez import BarthezTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_barthez_fast import BarthezTokenizerFast - + from .tokenization_barthez import * + from .tokenization_barthez_fast import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 46decddb3e10ba..604f9c7c21519a 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -284,3 +284,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = fi.write(content_spiece_model) return (out_vocab_file,) + + +__all__ = ["BarthezTokenizer"] diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py index df8cc7757e96c0..a1d95ef03e4882 100644 --- a/src/transformers/models/barthez/tokenization_barthez_fast.py +++ b/src/transformers/models/barthez/tokenization_barthez_fast.py @@ -192,3 +192,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) + + +__all__ = ["BarthezTokenizerFast"] diff --git a/src/transformers/models/bartpho/__init__.py b/src/transformers/models/bartpho/__init__.py index 
c20d7370c6566c..597be95d8175ca 100644 --- a/src/transformers/models/bartpho/__init__.py +++ b/src/transformers/models/bartpho/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,32 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available - +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = {} - -try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bartpho"] = ["BartphoTokenizer"] if TYPE_CHECKING: - try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bartpho import BartphoTokenizer - + from .tokenization_bartpho import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py index df121f26e255f4..e6e4f889842e8f 100644 --- a/src/transformers/models/bartpho/tokenization_bartpho.py +++ b/src/transformers/models/bartpho/tokenization_bartpho.py @@ -311,3 +311,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = fp.write(f"{str(token)} \n") return out_vocab_file, out_monolingual_vocab_file + + +__all__ = ["BartphoTokenizer"] diff --git a/src/transformers/models/beit/__init__.py b/src/transformers/models/beit/__init__.py index c2f49240d6e64c..0fc8919c7ea19a 100644 --- a/src/transformers/models/beit/__init__.py +++ b/src/transformers/models/beit/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,100 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
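The bart, barthez, and bartpho `__init__.py` files above all move to the same pattern: each submodule declares its public API in `__all__`, the package star-imports those submodules under `TYPE_CHECKING`, and at runtime `_LazyModule` is fed an import structure derived from the file via `define_import_structure`. As a rough illustration of the idea only (the real `define_import_structure` lives in `transformers.utils.import_utils` and its implementation is not shown in this diff), a hypothetical helper that harvests `__all__` from a package's submodules could look like this:

```python
import ast
from pathlib import Path

def collect_public_names(package_dir: str) -> dict:
    """Map each submodule to the names listed in its __all__, if it defines one."""
    structure = {}
    for path in sorted(Path(package_dir).glob("*.py")):
        if path.name == "__init__.py":
            continue
        tree = ast.parse(path.read_text(encoding="utf-8"))
        for node in tree.body:
            # Look for a top-level `__all__ = [...]` assignment.
            if isinstance(node, ast.Assign) and any(
                isinstance(target, ast.Name) and target.id == "__all__" for target in node.targets
            ):
                structure[path.stem] = list(ast.literal_eval(node.value))
    return structure

# e.g. collect_public_names("src/transformers/models/bartpho")
# could yield {"tokenization_bartpho": ["BartphoTokenizer"]}
```

With the public names collected per submodule, a lazy module can defer the actual imports until an attribute is first accessed.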
- from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_flax_available, - is_torch_available, - is_vision_available, -) - - -_import_structure = {"configuration_beit": ["BeitConfig", "BeitOnnxConfig"]} - -try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["feature_extraction_beit"] = ["BeitFeatureExtractor"] - _import_structure["image_processing_beit"] = ["BeitImageProcessor"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_beit"] = [ - "BeitForImageClassification", - "BeitForMaskedImageModeling", - "BeitForSemanticSegmentation", - "BeitModel", - "BeitPreTrainedModel", - "BeitBackbone", - ] - +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_beit"] = [ - "FlaxBeitForImageClassification", - "FlaxBeitForMaskedImageModeling", - "FlaxBeitModel", - "FlaxBeitPreTrainedModel", - ] if TYPE_CHECKING: - from .configuration_beit import BeitConfig, BeitOnnxConfig - - try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_beit import BeitFeatureExtractor - from .image_processing_beit import BeitImageProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_beit import ( - BeitBackbone, - BeitForImageClassification, - BeitForMaskedImageModeling, - BeitForSemanticSegmentation, - BeitModel, - BeitPreTrainedModel, - ) - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_beit import ( - FlaxBeitForImageClassification, - FlaxBeitForMaskedImageModeling, - FlaxBeitModel, - FlaxBeitPreTrainedModel, - ) - - + from .configuration_beit import * + from .convert_beit_unilm_to_pytorch import * + from .feature_extraction_beit import * + from .image_processing_beit import * + from .modeling_beit import * + from .modeling_flax_beit import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py index f0f3c2582c35cc..834988258c6b75 100644 --- a/src/transformers/models/beit/configuration_beit.py +++ b/src/transformers/models/beit/configuration_beit.py @@ -224,3 +224,6 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: @property def atol_for_validation(self) -> float: return 1e-4 + + +__all__ = ["BeitConfig", "BeitOnnxConfig"] diff --git a/src/transformers/models/beit/feature_extraction_beit.py b/src/transformers/models/beit/feature_extraction_beit.py index 59dacb4ae51f6e..141d8bc36d2bbb 100644 --- a/src/transformers/models/beit/feature_extraction_beit.py +++ b/src/transformers/models/beit/feature_extraction_beit.py @@ -31,3 +31,6 @@ def __init__(self, *args, **kwargs) -> None: FutureWarning, ) 
super().__init__(*args, **kwargs) + + +__all__ = ["BeitFeatureExtractor"] diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 7398381b2229bf..af76dd2e9656cb 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -510,3 +510,6 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] return semantic_segmentation + + +__all__ = ["BeitImageProcessor"] diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index f972e021f3e2b3..01c16ca2cf000b 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -1576,3 +1576,13 @@ def forward( hidden_states=outputs.hidden_states if output_hidden_states else None, attentions=outputs.attentions, ) + + +__all__ = [ + "BeitForImageClassification", + "BeitForMaskedImageModeling", + "BeitForSemanticSegmentation", + "BeitModel", + "BeitPreTrainedModel", + "BeitBackbone", +] diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py index c1da64d263a266..2d79c1820088a1 100644 --- a/src/transformers/models/beit/modeling_flax_beit.py +++ b/src/transformers/models/beit/modeling_flax_beit.py @@ -946,3 +946,11 @@ class FlaxBeitForImageClassification(FlaxBeitPreTrainedModel): append_replace_return_docstrings( FlaxBeitForImageClassification, output_type=FlaxSequenceClassifierOutput, config_class=BeitConfig ) + + +__all__ = [ + "FlaxBeitForImageClassification", + "FlaxBeitForMaskedImageModeling", + "FlaxBeitModel", + "FlaxBeitPreTrainedModel", +] diff --git a/src/transformers/models/bert/__init__.py b/src/transformers/models/bert/__init__.py index 17048a5d1c967a..3ed12a889321e6 100644 --- a/src/transformers/models/bert/__init__.py +++ b/src/transformers/models/bert/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,183 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
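The `__all__` lists being appended to each module are what make the `from .modeling_beit import *` style re-exports safe: a wildcard import only picks up the names a module explicitly lists. A tiny self-contained demonstration with toy names (not transformers code):

```python
import sys
import types

# Build a throwaway module with two classes but only one public name.
toy = types.ModuleType("toy_module")
exec(
    "__all__ = ['PublicProcessor']\n"
    "class PublicProcessor: ...\n"
    "class InternalHelper: ...\n",
    toy.__dict__,
)
sys.modules["toy_module"] = toy

namespace = {}
exec("from toy_module import *", namespace)
print("PublicProcessor" in namespace)  # True
print("InternalHelper" in namespace)   # False: not listed in __all__
```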
- from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_flax_available, - is_tensorflow_text_available, - is_tf_available, - is_tokenizers_available, - is_torch_available, -) - - -_import_structure = { - "configuration_bert": ["BertConfig", "BertOnnxConfig"], - "tokenization_bert": ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"], -} - -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bert_fast"] = ["BertTokenizerFast"] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bert"] = [ - "BertForMaskedLM", - "BertForMultipleChoice", - "BertForNextSentencePrediction", - "BertForPreTraining", - "BertForQuestionAnswering", - "BertForSequenceClassification", - "BertForTokenClassification", - "BertLayer", - "BertLMHeadModel", - "BertModel", - "BertPreTrainedModel", - "load_tf_weights_in_bert", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_bert"] = [ - "TFBertEmbeddings", - "TFBertForMaskedLM", - "TFBertForMultipleChoice", - "TFBertForNextSentencePrediction", - "TFBertForPreTraining", - "TFBertForQuestionAnswering", - "TFBertForSequenceClassification", - "TFBertForTokenClassification", - "TFBertLMHeadModel", - "TFBertMainLayer", - "TFBertModel", - "TFBertPreTrainedModel", - ] -try: - if not is_tensorflow_text_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bert_tf"] = ["TFBertTokenizer"] - -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_bert"] = [ - "FlaxBertForCausalLM", - "FlaxBertForMaskedLM", - "FlaxBertForMultipleChoice", - "FlaxBertForNextSentencePrediction", - "FlaxBertForPreTraining", - "FlaxBertForQuestionAnswering", - "FlaxBertForSequenceClassification", - "FlaxBertForTokenClassification", - "FlaxBertModel", - "FlaxBertPreTrainedModel", - ] if TYPE_CHECKING: - from .configuration_bert import BertConfig, BertOnnxConfig - from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bert_fast import BertTokenizerFast - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bert import ( - BertForMaskedLM, - BertForMultipleChoice, - BertForNextSentencePrediction, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - BertLayer, - BertLMHeadModel, - BertModel, - BertPreTrainedModel, - load_tf_weights_in_bert, - ) - - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_bert import ( - TFBertEmbeddings, - TFBertForMaskedLM, - TFBertForMultipleChoice, - TFBertForNextSentencePrediction, - TFBertForPreTraining, - TFBertForQuestionAnswering, - 
TFBertForSequenceClassification, - TFBertForTokenClassification, - TFBertLMHeadModel, - TFBertMainLayer, - TFBertModel, - TFBertPreTrainedModel, - ) - - try: - if not is_tensorflow_text_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bert_tf import TFBertTokenizer - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_bert import ( - FlaxBertForCausalLM, - FlaxBertForMaskedLM, - FlaxBertForMultipleChoice, - FlaxBertForNextSentencePrediction, - FlaxBertForPreTraining, - FlaxBertForQuestionAnswering, - FlaxBertForSequenceClassification, - FlaxBertForTokenClassification, - FlaxBertModel, - FlaxBertPreTrainedModel, - ) - + from .configuration_bert import * + from .convert_bert_original_tf2_checkpoint_to_pytorch import * + from .convert_bert_original_tf_checkpoint_to_pytorch import * + from .convert_bert_pytorch_checkpoint_to_original_tf import * + from .convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch import * + from .modeling_bert import * + from .modeling_flax_bert import * + from .modeling_tf_bert import * + from .tokenization_bert import * + from .tokenization_bert_fast import * + from .tokenization_bert_tf import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py index 613cf6a11463c2..ea29fb81c435aa 100644 --- a/src/transformers/models/bert/configuration_bert.py +++ b/src/transformers/models/bert/configuration_bert.py @@ -149,3 +149,6 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: ("token_type_ids", dynamic_axis), ] ) + + +__all__ = ["BertConfig", "BertOnnxConfig"] diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 6b05fa648158a6..0c53963cee7922 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1325,6 +1325,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **loss_kwargs, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -1375,11 +1376,7 @@ def forward( lm_loss = None if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs) if not return_dict: output = (prediction_scores,) + outputs[2:] @@ -1994,3 +1991,19 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +__all__ = [ + "BertForMaskedLM", + "BertForMultipleChoice", + "BertForNextSentencePrediction", + "BertForPreTraining", + "BertForQuestionAnswering", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertLayer", 
+ "BertLMHeadModel", + "BertModel", + "BertPreTrainedModel", + "load_tf_weights_in_bert", +] diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py index 772ea2bf12b2ee..83358c86bd280d 100644 --- a/src/transformers/models/bert/modeling_flax_bert.py +++ b/src/transformers/models/bert/modeling_flax_bert.py @@ -1711,3 +1711,17 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): FlaxCausalLMOutputWithCrossAttentions, _CONFIG_FOR_DOC, ) + + +__all__ = [ + "FlaxBertForCausalLM", + "FlaxBertForMaskedLM", + "FlaxBertForMultipleChoice", + "FlaxBertForNextSentencePrediction", + "FlaxBertForPreTraining", + "FlaxBertForQuestionAnswering", + "FlaxBertForSequenceClassification", + "FlaxBertForTokenClassification", + "FlaxBertModel", + "FlaxBertPreTrainedModel", +] diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index bb3281278adaa1..ce862194dc7787 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -2108,3 +2108,19 @@ def build(self, input_shape=None): if getattr(self, "qa_outputs", None) is not None: with tf.name_scope(self.qa_outputs.name): self.qa_outputs.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFBertEmbeddings", + "TFBertForMaskedLM", + "TFBertForMultipleChoice", + "TFBertForNextSentencePrediction", + "TFBertForPreTraining", + "TFBertForQuestionAnswering", + "TFBertForSequenceClassification", + "TFBertForTokenClassification", + "TFBertLMHeadModel", + "TFBertMainLayer", + "TFBertModel", + "TFBertPreTrainedModel", +] diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index 07583b949661de..42d4dd94554d41 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -502,3 +502,6 @@ def tokenize(self, text): else: output_tokens.extend(sub_tokens) return output_tokens + + +__all__ = ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"] diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py index f4897772847029..4a89e6053b988f 100644 --- a/src/transformers/models/bert/tokenization_bert_fast.py +++ b/src/transformers/models/bert/tokenization_bert_fast.py @@ -170,3 +170,6 @@ def create_token_type_ids_from_sequences( def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) + + +__all__ = ["BertTokenizerFast"] diff --git a/src/transformers/models/bert/tokenization_bert_tf.py b/src/transformers/models/bert/tokenization_bert_tf.py index ebf88eeac9bbe8..b1f49722fbdffa 100644 --- a/src/transformers/models/bert/tokenization_bert_tf.py +++ b/src/transformers/models/bert/tokenization_bert_tf.py @@ -252,3 +252,6 @@ def get_config(self): "sep_token_id": self.sep_token_id, "pad_token_id": self.pad_token_id, } + + +__all__ = ["TFBertTokenizer"] diff --git a/src/transformers/models/bert_generation/__init__.py b/src/transformers/models/bert_generation/__init__.py index 14cf8bb5879320..3f83b1f6e5bba3 100644 --- a/src/transformers/models/bert_generation/__init__.py +++ b/src/transformers/models/bert_generation/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,61 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available - - -_import_structure = {"configuration_bert_generation": ["BertGenerationConfig"]} - -try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bert_generation"] = ["BertGenerationTokenizer"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bert_generation"] = [ - "BertGenerationDecoder", - "BertGenerationEncoder", - "BertGenerationPreTrainedModel", - "load_tf_weights_in_bert_generation", - ] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .configuration_bert_generation import BertGenerationConfig - - try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bert_generation import BertGenerationTokenizer - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bert_generation import ( - BertGenerationDecoder, - BertGenerationEncoder, - BertGenerationPreTrainedModel, - load_tf_weights_in_bert_generation, - ) - + from .configuration_bert_generation import * + from .modeling_bert_generation import * + from .tokenization_bert_generation import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py index d1d1b51b6538e2..1abe7c1a1c44ab 100644 --- a/src/transformers/models/bert_generation/configuration_bert_generation.py +++ b/src/transformers/models/bert_generation/configuration_bert_generation.py @@ -122,3 +122,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type self.use_cache = use_cache + + +__all__ = ["BertGenerationConfig"] diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index db4a378577562a..aaf326aa2de8eb 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -785,9 +785,7 @@ def forward( # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
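The comment above refers to making a 2D padding mask broadcastable to all attention heads. Assuming the usual additive-mask convention (a large negative value added to masked scores), a minimal sketch of that conversion:

```python
import torch

attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])                  # 1 = attend, 0 = padding
extended = attention_mask[:, None, None, :].to(torch.float32)  # (batch, 1, 1, seq_len)
extended = (1.0 - extended) * torch.finfo(torch.float32).min   # ~-inf where masked
print(extended.shape)  # torch.Size([2, 1, 1, 4]); added to the raw attention scores
```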
- extended_attention_mask = None - if not use_cache: - extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) # If a 2D or 3D attention mask is provided for the cross-attention # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] @@ -998,3 +996,11 @@ def _reorder_cache(self, past_key_values, beam_idx): tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), ) return reordered_past + + +__all__ = [ + "BertGenerationDecoder", + "BertGenerationEncoder", + "BertGenerationPreTrainedModel", + "load_tf_weights_in_bert_generation", +] diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index b1adb9b62b2551..31f046863c289c 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -170,3 +170,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = fi.write(content_spiece_model) return (out_vocab_file,) + + +__all__ = ["BertGenerationTokenizer"] diff --git a/src/transformers/models/bert_japanese/__init__.py b/src/transformers/models/bert_japanese/__init__.py index a569c3cc54bff8..f5296087db1d00 100644 --- a/src/transformers/models/bert_japanese/__init__.py +++ b/src/transformers/models/bert_japanese/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,19 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
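`_reorder_cache` above re-indexes every cached key/value tensor along the batch dimension so each beam keeps the past states of the beam it was continued from. A self-contained sketch with a toy legacy-format cache (hypothetical helper name):

```python
import torch

def reorder_legacy_cache(past_key_values, beam_idx):
    # New beam b keeps the past key/value states of old beam beam_idx[b].
    return tuple(
        tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
        for layer_past in past_key_values
    )

# Toy cache: 2 layers, each a (key, value) pair of shape (num_beams, heads, seq, head_dim)
cache = tuple((torch.randn(3, 2, 4, 8), torch.randn(3, 2, 4, 8)) for _ in range(2))
beam_idx = torch.tensor([2, 0, 0])
reordered = reorder_legacy_cache(cache, beam_idx)
print(torch.equal(reordered[0][0][0], cache[0][0][2]))  # True: new beam 0 took old beam 2's past
```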
- from typing import TYPE_CHECKING from ...utils import _LazyModule - - -_import_structure = {"tokenization_bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"]} +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer - + from .tokenization_bert_japanese import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 732e9e7aff5741..8a841a3091623d 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -977,3 +977,6 @@ def tokenize(self, text): new_pieces.append(piece) return new_pieces + + +__all__ = ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"] diff --git a/src/transformers/models/bertweet/__init__.py b/src/transformers/models/bertweet/__init__.py index 42e4a23337c20c..432622f1595d1a 100644 --- a/src/transformers/models/bertweet/__init__.py +++ b/src/transformers/models/bertweet/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,19 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
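These single-tokenizer packages reduce to the same two-branch layout: real imports for type checkers, `_LazyModule` at runtime. As a toy illustration of lazy attribute resolution only (the actual `_LazyModule` internals are not shown in this diff), a module subclass can defer work until first access:

```python
import sys
import types

class _LazyToy(types.ModuleType):
    """Toy module that resolves attributes on first access (not the real _LazyModule)."""
    def __getattr__(self, name):
        if name == "BertweetTokenizer":
            value = f"{name} imported on demand"   # real code would import the submodule here
            setattr(self, name, value)             # cache so later lookups skip __getattr__
            return value
        raise AttributeError(f"module {self.__name__!r} has no attribute {name!r}")

sys.modules["toy_lazy_pkg"] = _LazyToy("toy_lazy_pkg")
import toy_lazy_pkg

print(toy_lazy_pkg.BertweetTokenizer)  # "BertweetTokenizer imported on demand"
```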
- from typing import TYPE_CHECKING from ...utils import _LazyModule - - -_import_structure = {"tokenization_bertweet": ["BertweetTokenizer"]} +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .tokenization_bertweet import BertweetTokenizer - + from .tokenization_bertweet import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index f478dd0832b6e4..499238e5955fe0 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -764,3 +764,6 @@ def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=Fa ############################################################################### + + +__all__ = ["BertweetTokenizer"] diff --git a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py index 5f972353c4f41e..d6640045b80c30 100644 --- a/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py +++ b/src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py @@ -249,7 +249,7 @@ def convert_blip2_checkpoint( {"image": original_pixel_values, "text_input": [caption]}, match_head="itm" ) logits = hf_model( - pixel_values=original_pixel_values, + pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, use_image_text_matching_head=True, @@ -274,7 +274,7 @@ def convert_blip2_checkpoint( {"image": original_pixel_values, "text_input": [caption]}, match_head="itc" ) logits = hf_model( - pixel_values=original_pixel_values, + pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, use_image_text_matching_head=False, diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index d34528b7431453..ed8ddd3c47dea3 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -2307,12 +2307,14 @@ def generate( language_attention_mask = torch.ones( language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device ) + if input_ids is None: - input_ids = ( - torch.LongTensor([[self.config.text_config.bos_token_id]]) - .repeat(batch_size, 1) - .to(image_embeds.device) - ) + start_tokens = [self.config.text_config.bos_token_id] + if getattr(self.config, "image_token_index", None) is not None: + start_tokens = [self.config.image_token_index] * self.config.num_query_tokens + start_tokens + input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device) + input_ids = input_ids.repeat(batch_size, 1) + inputs_embeds = self.get_input_embeddings()(input_ids) if attention_mask is None: attention_mask = torch.ones_like(input_ids) diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index b0e9a4bbcb91bd..b3dd3446cd848e 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -911,7 +911,7 @@ def prepare_inputs_for_generation( # This part differs from other models because BLOOM needs a 2D mask to construct alibi tensor # The only 
difference is the usage of 2D instead of 4D mask, but the shape will be static if isinstance(past_key_values, StaticCache) and attention_mask is not None: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() batch_size, seq_length = attention_mask.shape diff = target_length - seq_length diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 0661da8727996f..3255b6f44c05fb 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -25,7 +25,7 @@ from torch.nn import CrossEntropyLoss from ...activations import ACT2FN -from ...cache_utils import Cache, StaticCache +from ...cache_utils import Cache, DynamicCache, StaticCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_flash_attention_utils import _flash_attention_forward @@ -1300,6 +1300,10 @@ def forward( if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) + # torch.jit.trace() doesn't support cache objects in the output + if use_cache and past_key_values is None and not torch.jit.is_tracing(): + past_key_values = DynamicCache() + if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 cache_position = torch.arange( diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index b215fb6561bf81..d481d87e7ab8ed 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -890,7 +890,7 @@ def forward( all_self_attns = () if output_attentions else None next_decoder_cache = None - for decoder_layer in self.layers: + for decoder_layer in self.layers[: self.config.num_hidden_layers]: if output_hidden_states: all_hidden_states += (hidden_states,) @@ -1068,7 +1068,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( return causal_mask -# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere +# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] diff --git a/src/transformers/models/deberta/configuration_deberta.py b/src/transformers/models/deberta/configuration_deberta.py index 1c826a784f3454..cfee176047e0ce 100644 --- a/src/transformers/models/deberta/configuration_deberta.py +++ b/src/transformers/models/deberta/configuration_deberta.py @@ -82,6 +82,9 @@ class DebertaConfig(PretrainedConfig): `["p2c", "c2p"]`. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. + legacy (`bool`, *optional*, defaults to `True`): + Whether or not the model should use the legacy `LegacyDebertaOnlyMLMHead`, which does not work properly + for mask infilling tasks. 
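The Chameleon change above now builds a `DynamicCache` when caching is requested but no cache object was passed, skipping this under `torch.jit.trace` because traced graphs cannot return cache objects. A sketch of that guard in isolation (hypothetical function name):

```python
import torch
from transformers import DynamicCache

def maybe_init_cache(use_cache: bool, past_key_values):
    # Only build a fresh cache when caching is requested, none was provided,
    # and we are not being traced by torch.jit.trace.
    if use_cache and past_key_values is None and not torch.jit.is_tracing():
        past_key_values = DynamicCache()
    return past_key_values

print(type(maybe_init_cache(True, None)).__name__)  # DynamicCache
print(maybe_init_cache(False, None))                # None
```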
Example: @@ -121,6 +124,7 @@ def __init__( pos_att_type=None, pooler_dropout=0, pooler_hidden_act="gelu", + legacy=True, **kwargs, ): super().__init__(**kwargs) @@ -151,6 +155,7 @@ def __init__( self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size) self.pooler_dropout = pooler_dropout self.pooler_hidden_act = pooler_hidden_act + self.legacy = legacy # Copied from transformers.models.deberta_v2.configuration_deberta_v2.DebertaV2OnnxConfig diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 814d3cb28521c0..6993121b6c1ebe 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -14,7 +14,6 @@ # limitations under the License. """PyTorch DeBERTa model.""" -from collections.abc import Sequence from typing import Optional, Tuple, Union import torch @@ -31,7 +30,6 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import softmax_backward_data from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_deberta import DebertaConfig @@ -53,206 +51,6 @@ _QA_TARGET_END_INDEX = 14 -class ContextPooler(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size) - self.dropout = StableDropout(config.pooler_dropout) - self.config = config - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - - context_token = hidden_states[:, 0] - context_token = self.dropout(context_token) - pooled_output = self.dense(context_token) - pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output) - return pooled_output - - @property - def output_dim(self): - return self.config.hidden_size - - -class XSoftmax(torch.autograd.Function): - """ - Masked Softmax which is optimized for saving memory - - Args: - input (`torch.tensor`): The input tensor that will apply softmax. - mask (`torch.IntTensor`): - The mask matrix where 0 indicate that element will be ignored in the softmax calculation. 
- dim (int): The dimension that will apply softmax - - Example: - - ```python - >>> import torch - >>> from transformers.models.deberta.modeling_deberta import XSoftmax - - >>> # Make a tensor - >>> x = torch.randn([4, 20, 100]) - - >>> # Create a mask - >>> mask = (x > 0).int() - - >>> # Specify the dimension to apply softmax - >>> dim = -1 - - >>> y = XSoftmax.apply(x, mask, dim) - ```""" - - @staticmethod - def forward(ctx, input, mask, dim): - ctx.dim = dim - rmask = ~(mask.to(torch.bool)) - - output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min)) - output = torch.softmax(output, ctx.dim) - output.masked_fill_(rmask, 0) - ctx.save_for_backward(output) - return output - - @staticmethod - def backward(ctx, grad_output): - (output,) = ctx.saved_tensors - inputGrad = softmax_backward_data(ctx, grad_output, output, ctx.dim, output) - return inputGrad, None, None - - @staticmethod - def symbolic(g, self, mask, dim): - import torch.onnx.symbolic_helper as sym_help - from torch.onnx.symbolic_opset9 import masked_fill, softmax - - mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"]) - r_mask = g.op( - "Cast", - g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value), - to_i=sym_help.cast_pytorch_to_onnx["Bool"], - ) - output = masked_fill( - g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.type().dtype()).min)) - ) - output = softmax(g, output, dim) - return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool))) - - -class DropoutContext: - def __init__(self): - self.dropout = 0 - self.mask = None - self.scale = 1 - self.reuse_mask = True - - -def get_mask(input, local_context): - if not isinstance(local_context, DropoutContext): - dropout = local_context - mask = None - else: - dropout = local_context.dropout - dropout *= local_context.scale - mask = local_context.mask if local_context.reuse_mask else None - - if dropout > 0 and mask is None: - mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool) - - if isinstance(local_context, DropoutContext): - if local_context.mask is None: - local_context.mask = mask - - return mask, dropout - - -class XDropout(torch.autograd.Function): - """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" - - @staticmethod - def forward(ctx, input, local_ctx): - mask, dropout = get_mask(input, local_ctx) - ctx.scale = 1.0 / (1 - dropout) - if dropout > 0: - ctx.save_for_backward(mask) - return input.masked_fill(mask, 0) * ctx.scale - else: - return input - - @staticmethod - def backward(ctx, grad_output): - if ctx.scale > 1: - (mask,) = ctx.saved_tensors - return grad_output.masked_fill(mask, 0) * ctx.scale, None - else: - return grad_output, None - - @staticmethod - def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value: - from torch.onnx import symbolic_opset12 - - dropout_p = local_ctx - if isinstance(local_ctx, DropoutContext): - dropout_p = local_ctx.dropout - # StableDropout only calls this function when training. - train = True - # TODO: We should check if the opset_version being used to export - # is > 12 here, but there's no good way to do that. As-is, if the - # opset_version < 12, export will fail with a CheckerError. 
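The removed `XSoftmax` computes, in its forward pass, an ordinary masked softmax: fill masked positions with the dtype minimum, softmax, then zero the masked entries. A plain-PyTorch sketch of that forward computation (without the custom backward that saved memory):

```python
import torch

def masked_softmax(scores: torch.Tensor, mask: torch.Tensor, dim: int = -1) -> torch.Tensor:
    keep = mask.to(torch.bool)                                         # 0 in `mask` means "ignore"
    scores = scores.masked_fill(~keep, torch.finfo(scores.dtype).min)
    probs = torch.softmax(scores, dim=dim)
    return probs.masked_fill(~keep, 0.0)                               # zero out the ignored entries

x = torch.randn(4, 20, 100)
mask = (x > 0).int()                  # same toy mask as in the removed docstring example
print(masked_softmax(x, mask).shape)  # torch.Size([4, 20, 100])
```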
- # Once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like: - # if opset_version < 12: - # return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train) - return symbolic_opset12.dropout(g, input, dropout_p, train) - - -class StableDropout(nn.Module): - """ - Optimized dropout module for stabilizing the training - - Args: - drop_prob (float): the dropout probabilities - """ - - def __init__(self, drop_prob): - super().__init__() - self.drop_prob = drop_prob - self.count = 0 - self.context_stack = None - - def forward(self, x): - """ - Call the module - - Args: - x (`torch.tensor`): The input tensor to apply dropout - """ - if self.training and self.drop_prob > 0: - return XDropout.apply(x, self.get_context()) - return x - - def clear_context(self): - self.count = 0 - self.context_stack = None - - def init_context(self, reuse_mask=True, scale=1): - if self.context_stack is None: - self.context_stack = [] - self.count = 0 - for c in self.context_stack: - c.reuse_mask = reuse_mask - c.scale = scale - - def get_context(self): - if self.context_stack is not None: - if self.count >= len(self.context_stack): - self.context_stack.append(DropoutContext()) - ctx = self.context_stack[self.count] - ctx.dropout = self.drop_prob - self.count += 1 - return ctx - else: - return self.drop_prob - - class DebertaLayerNorm(nn.Module): """LayerNorm module in the TF style (epsilon inside the square root).""" @@ -278,74 +76,7 @@ def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) - self.dropout = StableDropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class DebertaAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.self = DisentangledSelfAttention(config) - self.output = DebertaSelfOutput(config) - self.config = config - - def forward( - self, - hidden_states, - attention_mask, - output_attentions=False, - query_states=None, - relative_pos=None, - rel_embeddings=None, - ): - self_output = self.self( - hidden_states, - attention_mask, - output_attentions, - query_states=query_states, - relative_pos=relative_pos, - rel_embeddings=rel_embeddings, - ) - if output_attentions: - self_output, att_matrix = self_output - if query_states is None: - query_states = hidden_states - attention_output = self.output(self_output, query_states) - - if output_attentions: - return (attention_output, att_matrix) - else: - return attention_output - - -# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Deberta -class DebertaIntermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class DebertaOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = 
DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) - self.dropout = StableDropout(config.hidden_dropout_prob) - self.config = config + self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): hidden_states = self.dense(hidden_states) @@ -354,142 +85,8 @@ def forward(self, hidden_states, input_tensor): return hidden_states -class DebertaLayer(nn.Module): - def __init__(self, config): - super().__init__() - self.attention = DebertaAttention(config) - self.intermediate = DebertaIntermediate(config) - self.output = DebertaOutput(config) - - def forward( - self, - hidden_states, - attention_mask, - query_states=None, - relative_pos=None, - rel_embeddings=None, - output_attentions=False, - ): - attention_output = self.attention( - hidden_states, - attention_mask, - output_attentions=output_attentions, - query_states=query_states, - relative_pos=relative_pos, - rel_embeddings=rel_embeddings, - ) - if output_attentions: - attention_output, att_matrix = attention_output - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - if output_attentions: - return (layer_output, att_matrix) - else: - return layer_output - - -class DebertaEncoder(nn.Module): - """Modified BertEncoder with relative position bias support""" - - def __init__(self, config): - super().__init__() - self.layer = nn.ModuleList([DebertaLayer(config) for _ in range(config.num_hidden_layers)]) - self.relative_attention = getattr(config, "relative_attention", False) - if self.relative_attention: - self.max_relative_positions = getattr(config, "max_relative_positions", -1) - if self.max_relative_positions < 1: - self.max_relative_positions = config.max_position_embeddings - self.rel_embeddings = nn.Embedding(self.max_relative_positions * 2, config.hidden_size) - self.gradient_checkpointing = False - - def get_rel_embedding(self): - rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None - return rel_embeddings - - def get_attention_mask(self, attention_mask): - if attention_mask.dim() <= 2: - extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1) - elif attention_mask.dim() == 3: - attention_mask = attention_mask.unsqueeze(1) - - return attention_mask - - def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): - if self.relative_attention and relative_pos is None: - q = query_states.size(-2) if query_states is not None else hidden_states.size(-2) - relative_pos = build_relative_position(q, hidden_states.size(-2), hidden_states.device) - return relative_pos - - def forward( - self, - hidden_states, - attention_mask, - output_hidden_states=True, - output_attentions=False, - query_states=None, - relative_pos=None, - return_dict=True, - ): - attention_mask = self.get_attention_mask(attention_mask) - relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) - - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - if isinstance(hidden_states, Sequence): - next_kv = hidden_states[0] - else: - next_kv = hidden_states - rel_embeddings = self.get_rel_embedding() - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if self.gradient_checkpointing and self.training: - hidden_states = 
self._gradient_checkpointing_func( - layer_module.__call__, - next_kv, - attention_mask, - query_states, - relative_pos, - rel_embeddings, - output_attentions, - ) - else: - hidden_states = layer_module( - next_kv, - attention_mask, - query_states=query_states, - relative_pos=relative_pos, - rel_embeddings=rel_embeddings, - output_attentions=output_attentions, - ) - - if output_attentions: - hidden_states, att_m = hidden_states - - if query_states is not None: - query_states = hidden_states - if isinstance(hidden_states, Sequence): - next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None - else: - next_kv = hidden_states - - if output_attentions: - all_attentions = all_attentions + (att_m,) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions - ) - - -def build_relative_position(query_size, key_size, device): +@torch.jit.script +def build_relative_position(query_layer, key_layer): """ Build relative position according to the query and key @@ -506,8 +103,11 @@ def build_relative_position(query_size, key_size, device): """ - q_ids = torch.arange(query_size, dtype=torch.long, device=device) - k_ids = torch.arange(key_size, dtype=torch.long, device=device) + query_size = query_layer.size(-2) + key_size = key_layer.size(-2) + + q_ids = torch.arange(query_size, dtype=torch.long, device=query_layer.device) + k_ids = torch.arange(key_size, dtype=torch.long, device=key_layer.device) rel_pos_ids = q_ids[:, None] - k_ids.view(1, -1).repeat(query_size, 1) rel_pos_ids = rel_pos_ids[:query_size, :] rel_pos_ids = rel_pos_ids.unsqueeze(0) @@ -529,6 +129,39 @@ def pos_dynamic_expand(pos_index, p2c_att, key_layer): return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2))) +###### To support a general trace, we have to define these operation as they use python objects (sizes) ################## +# which are not supported by torch.jit.trace. 
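`build_relative_position` above fills entry (i, j) with the signed distance i - j between query position i and key position j. A quick numeric illustration of that matrix:

```python
import torch

q_ids = torch.arange(4)                  # query positions
k_ids = torch.arange(6)                  # key positions
rel_pos = q_ids[:, None] - k_ids[None, :]
print(rel_pos)
# tensor([[ 0, -1, -2, -3, -4, -5],
#         [ 1,  0, -1, -2, -3, -4],
#         [ 2,  1,  0, -1, -2, -3],
#         [ 3,  2,  1,  0, -1, -2]])
```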
+# Full credits to @Szustarol +@torch.jit.script +def scaled_size_sqrt(query_layer: torch.Tensor, scale_factor: int): + return torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor) + + +@torch.jit.script +def build_rpos(query_layer: torch.Tensor, key_layer: torch.Tensor, relative_pos): + if query_layer.size(-2) != key_layer.size(-2): + return build_relative_position(query_layer, key_layer) + else: + return relative_pos + + +@torch.jit.script +def compute_attention_span(query_layer: torch.Tensor, key_layer: torch.Tensor, max_relative_positions: int): + return torch.tensor(min(max(query_layer.size(-2), key_layer.size(-2)), max_relative_positions)) + + +@torch.jit.script +def uneven_size_corrected(p2c_att, query_layer: torch.Tensor, key_layer: torch.Tensor, relative_pos): + if query_layer.size(-2) != key_layer.size(-2): + pos_index = relative_pos[:, :, :, 0].unsqueeze(-1) + return torch.gather(p2c_att, dim=2, index=pos_dynamic_expand(pos_index, p2c_att, key_layer)) + else: + return p2c_att + + +######################################################################################################################## + + class DisentangledSelfAttention(nn.Module): """ Disentangled self-attention module @@ -561,19 +194,22 @@ def __init__(self, config): if self.talking_head: self.head_logits_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False) self.head_weights_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False) + else: + self.head_logits_proj = None + self.head_weights_proj = None if self.relative_attention: self.max_relative_positions = getattr(config, "max_relative_positions", -1) if self.max_relative_positions < 1: self.max_relative_positions = config.max_position_embeddings - self.pos_dropout = StableDropout(config.hidden_dropout_prob) + self.pos_dropout = nn.Dropout(config.hidden_dropout_prob) if "c2p" in self.pos_att_type: self.pos_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False) if "p2c" in self.pos_att_type: self.pos_q_proj = nn.Linear(config.hidden_size, self.all_head_size) - self.dropout = StableDropout(config.attention_probs_dropout_prob) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) def transpose_for_scores(self, x): new_x_shape = x.size()[:-1] + (self.num_attention_heads, -1) @@ -582,13 +218,13 @@ def transpose_for_scores(self, x): def forward( self, - hidden_states, - attention_mask, - output_attentions=False, - query_states=None, - relative_pos=None, - rel_embeddings=None, - ): + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: bool = False, + query_states: Optional[torch.Tensor] = None, + relative_pos: Optional[torch.Tensor] = None, + rel_embeddings: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """ Call the module @@ -622,31 +258,24 @@ def forward( qp = self.in_proj(hidden_states) # .split(self.all_head_size, dim=-1) query_layer, key_layer, value_layer = self.transpose_for_scores(qp).chunk(3, dim=-1) else: - - def linear(w, b, x): - if b is not None: - return torch.matmul(x, w.t()) + b.t() - else: - return torch.matmul(x, w.t()) # + b.t() - ws = self.in_proj.weight.chunk(self.num_attention_heads * 3, dim=0) qkvw = [torch.cat([ws[i * 3 + k] for i in range(self.num_attention_heads)], dim=0) for k in range(3)] - qkvb = [None] * 3 - - q = linear(qkvw[0], qkvb[0], query_states.to(dtype=qkvw[0].dtype)) - k, v = [linear(qkvw[i], qkvb[i], hidden_states.to(dtype=qkvw[i].dtype)) for i in 
range(1, 3)] + q = torch.matmul(qkvw[0], query_states.t().to(dtype=qkvw[0].dtype)) + k = torch.matmul(qkvw[1], hidden_states.t().to(dtype=qkvw[1].dtype)) + v = torch.matmul(qkvw[2], hidden_states.t().to(dtype=qkvw[2].dtype)) query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q, k, v]] query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :]) value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :]) - rel_att = None + rel_att: int = 0 # Take the dot product between "query" and "key" to get the raw attention scores. scale_factor = 1 + len(self.pos_att_type) - scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor) + scale = scaled_size_sqrt(query_layer, scale_factor) query_layer = query_layer / scale.to(dtype=query_layer.dtype) attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.relative_attention: + + if self.relative_attention and rel_embeddings is not None and relative_pos is not None: rel_embeddings = self.pos_dropout(rel_embeddings) rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor) @@ -654,27 +283,37 @@ def linear(w, b, x): attention_scores = attention_scores + rel_att # bxhxlxd - if self.talking_head: + if self.head_logits_proj is not None: attention_scores = self.head_logits_proj(attention_scores.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) - attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + attention_mask = attention_mask.bool() + attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min) + # bsz x height x length x dimension + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + attention_probs.masked_fill(attention_mask, 0) + attention_probs = self.dropout(attention_probs) - if self.talking_head: + if self.head_weights_proj is not None: attention_probs = self.head_weights_proj(attention_probs.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (-1,) context_layer = context_layer.view(new_context_layer_shape) - if output_attentions: - return (context_layer, attention_probs) - else: - return context_layer + if not output_attentions: + return (context_layer, None) + return (context_layer, attention_probs) - def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): + def disentangled_att_bias( + self, + query_layer: torch.Tensor, + key_layer: torch.Tensor, + relative_pos: torch.Tensor, + rel_embeddings: torch.Tensor, + scale_factor: int, + ): if relative_pos is None: - q = query_layer.size(-2) - relative_pos = build_relative_position(q, key_layer.size(-2), query_layer.device) + relative_pos = build_relative_position(query_layer, key_layer, query_layer.device) if relative_pos.dim() == 2: relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) elif relative_pos.dim() == 3: @@ -683,8 +322,8 @@ def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embedd elif relative_pos.dim() != 4: raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.dim()}") - att_span = min(max(query_layer.size(-2), key_layer.size(-2)), self.max_relative_positions) - relative_pos = relative_pos.long().to(query_layer.device) + att_span = compute_attention_span(query_layer, key_layer, self.max_relative_positions) + relative_pos = relative_pos.long() rel_embeddings = rel_embeddings[ self.max_relative_positions - att_span : self.max_relative_positions + att_span, : ].unsqueeze(0) @@ -704,20 +343,19 @@ def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embedd if "p2c" in self.pos_att_type: pos_query_layer = self.pos_q_proj(rel_embeddings) pos_query_layer = self.transpose_for_scores(pos_query_layer) - pos_query_layer /= torch.sqrt(torch.tensor(pos_query_layer.size(-1), dtype=torch.float) * scale_factor) - if query_layer.size(-2) != key_layer.size(-2): - r_pos = build_relative_position(key_layer.size(-2), key_layer.size(-2), query_layer.device) - else: - r_pos = relative_pos + pos_query_layer /= scaled_size_sqrt(pos_query_layer, scale_factor) + r_pos = build_rpos( + query_layer, + key_layer, + relative_pos, + ) p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1) p2c_att = torch.matmul(key_layer, pos_query_layer.transpose(-1, -2).to(dtype=key_layer.dtype)) p2c_att = torch.gather( p2c_att, dim=-1, index=p2c_dynamic_expand(p2c_pos, query_layer, key_layer) ).transpose(-1, -2) - if query_layer.size(-2) != key_layer.size(-2): - pos_index = relative_pos[:, :, :, 0].unsqueeze(-1) - p2c_att = torch.gather(p2c_att, dim=-2, index=pos_dynamic_expand(pos_index, p2c_att, key_layer)) + p2c_att = uneven_size_corrected(p2c_att, query_layer, key_layer, relative_pos) score += p2c_att return score @@ -732,71 +370,267 @@ def __init__(self, config): self.embedding_size = getattr(config, "embedding_size", config.hidden_size) self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id) - self.position_biased_input = getattr(config, "position_biased_input", True) - if not self.position_biased_input: - self.position_embeddings = None + self.position_biased_input = getattr(config, "position_biased_input", True) + if not self.position_biased_input: + self.position_embeddings = None + else: + self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size) + + if config.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size) + else: + self.token_type_embeddings = None + + if self.embedding_size != config.hidden_size: + self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False) + else: + self.embed_proj = None + + self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = 
self.word_embeddings(input_ids) + + if self.position_embeddings is not None: + position_embeddings = self.position_embeddings(position_ids.long()) + else: + position_embeddings = torch.zeros_like(inputs_embeds) + + embeddings = inputs_embeds + if self.position_biased_input: + embeddings += position_embeddings + if self.token_type_embeddings is not None: + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings += token_type_embeddings + + if self.embed_proj is not None: + embeddings = self.embed_proj(embeddings) + + embeddings = self.LayerNorm(embeddings) + + if mask is not None: + if mask.dim() != embeddings.dim(): + if mask.dim() == 4: + mask = mask.squeeze(1).squeeze(1) + mask = mask.unsqueeze(2) + mask = mask.to(embeddings.dtype) + + embeddings = embeddings * mask + + embeddings = self.dropout(embeddings) + return embeddings + + +class DebertaAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = DisentangledSelfAttention(config) + self.output = DebertaSelfOutput(config) + self.config = config + + def forward( + self, + hidden_states, + attention_mask, + output_attentions: bool = False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + self_output, att_matrix = self.self( + hidden_states, + attention_mask, + output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if query_states is None: + query_states = hidden_states + attention_output = self.output(self_output, query_states) + + if output_attentions: + return (attention_output, att_matrix) + else: + return (attention_output, None) + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Deberta +class DebertaIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class DebertaOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class DebertaLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = DebertaAttention(config) + self.intermediate = DebertaIntermediate(config) + self.output = DebertaOutput(config) + + def forward( + self, + hidden_states, + attention_mask, + query_states=None, + relative_pos=None, + rel_embeddings=None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + attention_output, att_matrix = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + intermediate_output = self.intermediate(attention_output) + 
layer_output = self.output(intermediate_output, attention_output) + + if output_attentions: + return (layer_output, att_matrix) else: - self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size) + return (layer_output, None) - if config.type_vocab_size > 0: - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size) - if self.embedding_size != config.hidden_size: - self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False) - self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) - self.dropout = StableDropout(config.hidden_dropout_prob) - self.config = config +class DebertaEncoder(PreTrainedModel): + """Modified BertEncoder with relative position bias support""" - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer( - "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False - ) + def __init__(self, config): + super().__init__(config) + self.layer = nn.ModuleList([DebertaLayer(config) for _ in range(config.num_hidden_layers)]) + self.relative_attention = getattr(config, "relative_attention", False) + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.rel_embeddings = nn.Embedding(self.max_relative_positions * 2, config.hidden_size) + self.gradient_checkpointing = False - def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] + def get_rel_embedding(self): + rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None + return rel_embeddings - seq_length = input_shape[1] + def get_attention_mask(self, attention_mask): + if attention_mask.dim() <= 2: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1) + elif attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) - if position_ids is None: - position_ids = self.position_ids[:, :seq_length] + return attention_mask - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + if query_states is not None: + relative_pos = build_relative_position(query_states, hidden_states) + else: + relative_pos = build_relative_position(hidden_states, hidden_states) + return relative_pos - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_hidden_states: bool = True, + output_attentions: bool = False, + query_states=None, + relative_pos=None, + return_dict: bool = True, + ): + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) - if self.position_embeddings is not None: - position_embeddings = self.position_embeddings(position_ids.long()) - else: - position_embeddings = torch.zeros_like(inputs_embeds) + all_hidden_states: Optional[Tuple[torch.Tensor]] = (hidden_states,) if output_hidden_states else 
None + all_attentions = () if output_attentions else None - embeddings = inputs_embeds - if self.position_biased_input: - embeddings += position_embeddings - if self.config.type_vocab_size > 0: - token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings += token_type_embeddings + next_kv = hidden_states - if self.embedding_size != self.config.hidden_size: - embeddings = self.embed_proj(embeddings) + rel_embeddings = self.get_rel_embedding() + for i, layer_module in enumerate(self.layer): + if self.gradient_checkpointing and self.training: + hidden_states, att_m = self._gradient_checkpointing_func( + layer_module.__call__, + next_kv, + attention_mask, + query_states, + relative_pos, + rel_embeddings, + output_attentions, + ) + else: + hidden_states, att_m = layer_module( + next_kv, + attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + ) - embeddings = self.LayerNorm(embeddings) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) - if mask is not None: - if mask.dim() != embeddings.dim(): - if mask.dim() == 4: - mask = mask.squeeze(1).squeeze(1) - mask = mask.unsqueeze(2) - mask = mask.to(embeddings.dtype) + if query_states is not None: + query_states = hidden_states + else: + next_kv = hidden_states - embeddings = embeddings * mask + if output_attentions: + all_attentions = all_attentions + (att_m,) - embeddings = self.dropout(embeddings) - return embeddings + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) class DebertaPreTrainedModel(PreTrainedModel): @@ -1000,25 +834,128 @@ def forward( ) +class LegacyDebertaPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + + self.dense = nn.Linear(config.hidden_size, self.embedding_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class LegacyDebertaLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = LegacyDebertaPredictionHeadTransform(config) + + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
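+        # (The decoder weight itself is tied to the input word embeddings through the model's weight-tying
+        # machinery, see `get_output_embeddings` / `_tied_weights_keys`; only this bias is a standalone parameter.)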
+ self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->LegacyDeberta +class LegacyDebertaOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = LegacyDebertaLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class DebertaLMPredictionHead(nn.Module): + """https://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=True) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # note that the input embeddings must be passed as an argument + def forward(self, hidden_states, word_embeddings): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm( + hidden_states + ) # original used MaskedLayerNorm, but passed no mask. This is equivalent. 
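+        # (MaskedLayerNorm in the original DeBERTa repository only differs from nn.LayerNorm when a mask is
+        # supplied; with mask=None it reduces to a plain LayerNorm, hence the direct nn.LayerNorm call here.)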
+ hidden_states = torch.matmul(hidden_states, word_embeddings.weight.t()) + self.bias + return hidden_states + + +class DebertaOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.lm_head = DebertaLMPredictionHead(config) + + # note that the input embeddings must be passed as an argument + def forward(self, sequence_output, word_embeddings): + prediction_scores = self.lm_head(sequence_output, word_embeddings) + return prediction_scores + + @add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) class DebertaForMaskedLM(DebertaPreTrainedModel): _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): super().__init__(config) - + self.legacy = config.legacy self.deberta = DebertaModel(config) - self.cls = DebertaOnlyMLMHead(config) + if self.legacy: + self.cls = LegacyDebertaOnlyMLMHead(config) + else: + self._tied_weights_keys = ["lm_predictions.lm_head.weight", "deberta.embeddings.word_embeddings.weight"] + self.lm_predictions = DebertaOnlyMLMHead(config) # Initialize weights and apply final processing self.post_init() def get_output_embeddings(self): - return self.cls.predictions.decoder + if self.legacy: + return self.cls.predictions.decoder + else: + return self.lm_predictions.lm_head.dense def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - self.cls.predictions.bias = new_embeddings.bias + if self.legacy: + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + else: + self.lm_predictions.lm_head.dense = new_embeddings + self.lm_predictions.lm_head.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( @@ -1062,7 +999,10 @@ def forward( ) sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) + if self.legacy: + prediction_scores = self.cls(sequence_output) + else: + prediction_scores = self.lm_predictions(sequence_output, self.deberta.embeddings.word_embeddings) masked_lm_loss = None if labels is not None: @@ -1081,58 +1021,26 @@ def forward( ) -class DebertaPredictionHeadTransform(nn.Module): - def __init__(self, config): - super().__init__() - self.embedding_size = getattr(config, "embedding_size", config.hidden_size) - - self.dense = nn.Linear(config.hidden_size, self.embedding_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -class DebertaLMPredictionHead(nn.Module): +class ContextPooler(nn.Module): def __init__(self, config): super().__init__() - self.transform = DebertaPredictionHeadTransform(config) - - self.embedding_size = getattr(config, "embedding_size", config.hidden_size) - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. 
- self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def _tie_weights(self): - self.decoder.bias = self.bias + self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size) + self.dropout = nn.Dropout(config.pooler_dropout) + self.config = config def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. -# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta -class DebertaOnlyMLMHead(nn.Module): - def __init__(self, config): - super().__init__() - self.predictions = DebertaLMPredictionHead(config) + context_token = hidden_states[:, 0] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output) + return pooled_output - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores + @property + def output_dim(self): + return self.config.hidden_size @add_start_docstrings( @@ -1156,7 +1064,7 @@ def __init__(self, config): self.classifier = nn.Linear(output_dim, num_labels) drop_out = getattr(config, "cls_dropout", None) drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) + self.dropout = nn.Dropout(drop_out) # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/deberta_v2/configuration_deberta_v2.py b/src/transformers/models/deberta_v2/configuration_deberta_v2.py index 80ab012411782b..cf3f61033c3285 100644 --- a/src/transformers/models/deberta_v2/configuration_deberta_v2.py +++ b/src/transformers/models/deberta_v2/configuration_deberta_v2.py @@ -82,6 +82,9 @@ class DebertaV2Config(PretrainedConfig): `["p2c", "c2p"]`, `["p2c", "c2p"]`. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. + legacy (`bool`, *optional*, defaults to `True`): + Whether or not the model should use the legacy `LegacyDebertaOnlyMLMHead`, which does not work properly + for mask infilling tasks. 
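A minimal sketch of how the new flag might be used (the checkpoint name below is illustrative only; passing `legacy=False` opts into the non-legacy MLM head added in this change):

```python
from transformers import DebertaV2Config, DebertaV2ForMaskedLM

# Illustrative checkpoint name; any DeBERTa-v2/v3 checkpoint works the same way.
checkpoint = "microsoft/deberta-v3-base"

# Override the default `legacy=True` to use the new MLM head for mask infilling.
config = DebertaV2Config.from_pretrained(checkpoint, legacy=False)
model = DebertaV2ForMaskedLM.from_pretrained(checkpoint, config=config)
```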
Example: @@ -121,6 +124,7 @@ def __init__( pos_att_type=None, pooler_dropout=0, pooler_hidden_act="gelu", + legacy=True, **kwargs, ): super().__init__(**kwargs) @@ -151,6 +155,7 @@ def __init__( self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size) self.pooler_dropout = pooler_dropout self.pooler_hidden_act = pooler_hidden_act + self.legacy = legacy class DebertaV2OnnxConfig(OnnxConfig): diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index f47cb86ab52acb..6645c1de832e12 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -32,7 +32,6 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import softmax_backward_data from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_deberta_v2 import DebertaV2Config @@ -45,501 +44,23 @@ _QA_TARGET_END_INDEX = 9 -# Copied from transformers.models.deberta.modeling_deberta.ContextPooler -class ContextPooler(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size) - self.dropout = StableDropout(config.pooler_dropout) - self.config = config - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - - context_token = hidden_states[:, 0] - context_token = self.dropout(context_token) - pooled_output = self.dense(context_token) - pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output) - return pooled_output - - @property - def output_dim(self): - return self.config.hidden_size - - -# Copied from transformers.models.deberta.modeling_deberta.XSoftmax with deberta->deberta_v2 -class XSoftmax(torch.autograd.Function): - """ - Masked Softmax which is optimized for saving memory - - Args: - input (`torch.tensor`): The input tensor that will apply softmax. - mask (`torch.IntTensor`): - The mask matrix where 0 indicate that element will be ignored in the softmax calculation. 
- dim (int): The dimension that will apply softmax - - Example: - - ```python - >>> import torch - >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax - - >>> # Make a tensor - >>> x = torch.randn([4, 20, 100]) - - >>> # Create a mask - >>> mask = (x > 0).int() - - >>> # Specify the dimension to apply softmax - >>> dim = -1 - - >>> y = XSoftmax.apply(x, mask, dim) - ```""" - - @staticmethod - def forward(ctx, input, mask, dim): - ctx.dim = dim - rmask = ~(mask.to(torch.bool)) - - output = input.masked_fill(rmask, torch.tensor(torch.finfo(input.dtype).min)) - output = torch.softmax(output, ctx.dim) - output.masked_fill_(rmask, 0) - ctx.save_for_backward(output) - return output - - @staticmethod - def backward(ctx, grad_output): - (output,) = ctx.saved_tensors - inputGrad = softmax_backward_data(ctx, grad_output, output, ctx.dim, output) - return inputGrad, None, None - - @staticmethod - def symbolic(g, self, mask, dim): - import torch.onnx.symbolic_helper as sym_help - from torch.onnx.symbolic_opset9 import masked_fill, softmax - - mask_cast_value = g.op("Cast", mask, to_i=sym_help.cast_pytorch_to_onnx["Long"]) - r_mask = g.op( - "Cast", - g.op("Sub", g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)), mask_cast_value), - to_i=sym_help.cast_pytorch_to_onnx["Bool"], - ) - output = masked_fill( - g, self, r_mask, g.op("Constant", value_t=torch.tensor(torch.finfo(self.type().dtype()).min)) - ) - output = softmax(g, output, dim) - return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool))) - - -# Copied from transformers.models.deberta.modeling_deberta.DropoutContext -class DropoutContext: - def __init__(self): - self.dropout = 0 - self.mask = None - self.scale = 1 - self.reuse_mask = True - - -# Copied from transformers.models.deberta.modeling_deberta.get_mask -def get_mask(input, local_context): - if not isinstance(local_context, DropoutContext): - dropout = local_context - mask = None - else: - dropout = local_context.dropout - dropout *= local_context.scale - mask = local_context.mask if local_context.reuse_mask else None - - if dropout > 0 and mask is None: - mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool) - - if isinstance(local_context, DropoutContext): - if local_context.mask is None: - local_context.mask = mask - - return mask, dropout - - -# Copied from transformers.models.deberta.modeling_deberta.XDropout -class XDropout(torch.autograd.Function): - """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" - - @staticmethod - def forward(ctx, input, local_ctx): - mask, dropout = get_mask(input, local_ctx) - ctx.scale = 1.0 / (1 - dropout) - if dropout > 0: - ctx.save_for_backward(mask) - return input.masked_fill(mask, 0) * ctx.scale - else: - return input - - @staticmethod - def backward(ctx, grad_output): - if ctx.scale > 1: - (mask,) = ctx.saved_tensors - return grad_output.masked_fill(mask, 0) * ctx.scale, None - else: - return grad_output, None - - @staticmethod - def symbolic(g: torch._C.Graph, input: torch._C.Value, local_ctx: Union[float, DropoutContext]) -> torch._C.Value: - from torch.onnx import symbolic_opset12 - - dropout_p = local_ctx - if isinstance(local_ctx, DropoutContext): - dropout_p = local_ctx.dropout - # StableDropout only calls this function when training. - train = True - # TODO: We should check if the opset_version being used to export - # is > 12 here, but there's no good way to do that. 
As-is, if the - # opset_version < 12, export will fail with a CheckerError. - # Once https://github.com/pytorch/pytorch/issues/78391 is fixed, do something like: - # if opset_version < 12: - # return torch.onnx.symbolic_opset9.dropout(g, input, dropout_p, train) - return symbolic_opset12.dropout(g, input, dropout_p, train) - - -# Copied from transformers.models.deberta.modeling_deberta.StableDropout -class StableDropout(nn.Module): - """ - Optimized dropout module for stabilizing the training - - Args: - drop_prob (float): the dropout probabilities - """ - - def __init__(self, drop_prob): - super().__init__() - self.drop_prob = drop_prob - self.count = 0 - self.context_stack = None - - def forward(self, x): - """ - Call the module - - Args: - x (`torch.tensor`): The input tensor to apply dropout - """ - if self.training and self.drop_prob > 0: - return XDropout.apply(x, self.get_context()) - return x - - def clear_context(self): - self.count = 0 - self.context_stack = None - - def init_context(self, reuse_mask=True, scale=1): - if self.context_stack is None: - self.context_stack = [] - self.count = 0 - for c in self.context_stack: - c.reuse_mask = reuse_mask - c.scale = scale - - def get_context(self): - if self.context_stack is not None: - if self.count >= len(self.context_stack): - self.context_stack.append(DropoutContext()) - ctx = self.context_stack[self.count] - ctx.dropout = self.drop_prob - self.count += 1 - return ctx - else: - return self.drop_prob - - -# Copied from transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm -class DebertaV2SelfOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) - self.dropout = StableDropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2 -class DebertaV2Attention(nn.Module): - def __init__(self, config): - super().__init__() - self.self = DisentangledSelfAttention(config) - self.output = DebertaV2SelfOutput(config) - self.config = config - - def forward( - self, - hidden_states, - attention_mask, - output_attentions=False, - query_states=None, - relative_pos=None, - rel_embeddings=None, - ): - self_output = self.self( - hidden_states, - attention_mask, - output_attentions, - query_states=query_states, - relative_pos=relative_pos, - rel_embeddings=rel_embeddings, - ) - if output_attentions: - self_output, att_matrix = self_output - if query_states is None: - query_states = hidden_states - attention_output = self.output(self_output, query_states) - - if output_attentions: - return (attention_output, att_matrix) - else: - return attention_output - - -# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2 -class DebertaV2Intermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = 
self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm -class DebertaV2Output(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) - self.dropout = StableDropout(config.hidden_dropout_prob) - self.config = config - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2 -class DebertaV2Layer(nn.Module): - def __init__(self, config): - super().__init__() - self.attention = DebertaV2Attention(config) - self.intermediate = DebertaV2Intermediate(config) - self.output = DebertaV2Output(config) - - def forward( - self, - hidden_states, - attention_mask, - query_states=None, - relative_pos=None, - rel_embeddings=None, - output_attentions=False, - ): - attention_output = self.attention( - hidden_states, - attention_mask, - output_attentions=output_attentions, - query_states=query_states, - relative_pos=relative_pos, - rel_embeddings=rel_embeddings, - ) - if output_attentions: - attention_output, att_matrix = attention_output - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - if output_attentions: - return (layer_output, att_matrix) - else: - return layer_output - - -class ConvLayer(nn.Module): - def __init__(self, config): - super().__init__() - kernel_size = getattr(config, "conv_kernel_size", 3) - groups = getattr(config, "conv_groups", 1) - self.conv_act = getattr(config, "conv_act", "tanh") - self.conv = nn.Conv1d( - config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups - ) - self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) - self.dropout = StableDropout(config.hidden_dropout_prob) - self.config = config - - def forward(self, hidden_states, residual_states, input_mask): - out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous() - rmask = (1 - input_mask).bool() - out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0) - out = ACT2FN[self.conv_act](self.dropout(out)) - - layer_norm_input = residual_states + out - output = self.LayerNorm(layer_norm_input).to(layer_norm_input) - - if input_mask is None: - output_states = output - else: - if input_mask.dim() != layer_norm_input.dim(): - if input_mask.dim() == 4: - input_mask = input_mask.squeeze(1).squeeze(1) - input_mask = input_mask.unsqueeze(2) - - input_mask = input_mask.to(output.dtype) - output_states = output * input_mask - - return output_states - - -class DebertaV2Encoder(nn.Module): - """Modified BertEncoder with relative position bias support""" - - def __init__(self, config): - super().__init__() - - self.layer = nn.ModuleList([DebertaV2Layer(config) for _ in range(config.num_hidden_layers)]) - self.relative_attention = getattr(config, "relative_attention", False) - - if self.relative_attention: - self.max_relative_positions = getattr(config, "max_relative_positions", -1) - if self.max_relative_positions < 1: - self.max_relative_positions = config.max_position_embeddings - - 
self.position_buckets = getattr(config, "position_buckets", -1) - pos_ebd_size = self.max_relative_positions * 2 - - if self.position_buckets > 0: - pos_ebd_size = self.position_buckets * 2 - - self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size) - - self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")] - - if "layer_norm" in self.norm_rel_ebd: - self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True) - - self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None - self.gradient_checkpointing = False - - def get_rel_embedding(self): - rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None - if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd): - rel_embeddings = self.LayerNorm(rel_embeddings) - return rel_embeddings - - def get_attention_mask(self, attention_mask): - if attention_mask.dim() <= 2: - extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1) - elif attention_mask.dim() == 3: - attention_mask = attention_mask.unsqueeze(1) - - return attention_mask - - def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): - if self.relative_attention and relative_pos is None: - q = query_states.size(-2) if query_states is not None else hidden_states.size(-2) - relative_pos = build_relative_position( - q, - hidden_states.size(-2), - bucket_size=self.position_buckets, - max_position=self.max_relative_positions, - device=hidden_states.device, - ) - return relative_pos - - def forward( - self, - hidden_states, - attention_mask, - output_hidden_states=True, - output_attentions=False, - query_states=None, - relative_pos=None, - return_dict=True, - ): - if attention_mask.dim() <= 2: - input_mask = attention_mask - else: - input_mask = attention_mask.sum(-2) > 0 - attention_mask = self.get_attention_mask(attention_mask) - relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) - - all_hidden_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - if isinstance(hidden_states, Sequence): - next_kv = hidden_states[0] - else: - next_kv = hidden_states - rel_embeddings = self.get_rel_embedding() - output_states = next_kv - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (output_states,) - - if self.gradient_checkpointing and self.training: - output_states = self._gradient_checkpointing_func( - layer_module.__call__, - next_kv, - attention_mask, - query_states, - relative_pos, - rel_embeddings, - output_attentions, - ) - else: - output_states = layer_module( - next_kv, - attention_mask, - query_states=query_states, - relative_pos=relative_pos, - rel_embeddings=rel_embeddings, - output_attentions=output_attentions, - ) - - if output_attentions: - output_states, att_m = output_states - - if i == 0 and self.conv is not None: - output_states = self.conv(hidden_states, output_states, input_mask) - - if query_states is not None: - query_states = output_states - if isinstance(hidden_states, Sequence): - next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None - else: - next_kv = output_states - - if output_attentions: - all_attentions = all_attentions + (att_m,) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (output_states,) +# Copied from 
transformers.models.deberta.modeling_deberta.DebertaSelfOutput with DebertaLayerNorm->LayerNorm +class DebertaV2SelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) - if not return_dict: - return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions - ) + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states -def make_log_bucket_position(relative_pos, bucket_size, max_position): +@torch.jit.script +def make_log_bucket_position(relative_pos, bucket_size: int, max_position: int): sign = torch.sign(relative_pos) mid = bucket_size // 2 abs_pos = torch.where( @@ -554,7 +75,7 @@ def make_log_bucket_position(relative_pos, bucket_size, max_position): return bucket_pos -def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1, device=None): +def build_relative_position(query_layer, key_layer, bucket_size: int = -1, max_position: int = -1): """ Build relative position according to the query and key @@ -572,9 +93,11 @@ def build_relative_position(query_size, key_size, bucket_size=-1, max_position=- Return: `torch.LongTensor`: A tensor with shape [1, query_size, key_size] """ + query_size = query_layer.size(-2) + key_size = key_layer.size(-2) - q_ids = torch.arange(0, query_size, device=device) - k_ids = torch.arange(0, key_size, device=device) + q_ids = torch.arange(query_size, dtype=torch.long, device=query_layer.device) + k_ids = torch.arange(key_size, dtype=torch.long, device=key_layer.device) rel_pos_ids = q_ids[:, None] - k_ids[None, :] if bucket_size > 0 and max_position > 0: rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) @@ -602,6 +125,24 @@ def pos_dynamic_expand(pos_index, p2c_att, key_layer): return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2))) +@torch.jit.script +def scaled_size_sqrt(query_layer: torch.Tensor, scale_factor: int): + return torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor) + + +@torch.jit.script +def build_rpos(query_layer, key_layer, relative_pos, position_buckets: int, max_relative_positions: int): + if key_layer.size(-2) != query_layer.size(-2): + return build_relative_position( + key_layer, + key_layer, + bucket_size=position_buckets, + max_position=max_relative_positions, + ) + else: + return relative_pos + + class DisentangledSelfAttention(nn.Module): """ Disentangled self-attention module @@ -641,7 +182,7 @@ def __init__(self, config): if self.position_buckets > 0: self.pos_ebd_size = self.position_buckets - self.pos_dropout = StableDropout(config.hidden_dropout_prob) + self.pos_dropout = nn.Dropout(config.hidden_dropout_prob) if not self.share_att_key: if "c2p" in self.pos_att_type: @@ -649,9 +190,9 @@ def __init__(self, config): if "p2c" in self.pos_att_type: self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size) - self.dropout = StableDropout(config.attention_probs_dropout_prob) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - def transpose_for_scores(self, x, attention_heads): + def 
transpose_for_scores(self, x, attention_heads) -> torch.Tensor: new_x_shape = x.size()[:-1] + (attention_heads, -1) x = x.view(new_x_shape) return x.permute(0, 2, 1, 3).contiguous().view(-1, x.size(1), x.size(-1)) @@ -707,7 +248,7 @@ def forward( scale_factor += 1 if "p2c" in self.pos_att_type: scale_factor += 1 - scale = torch.sqrt(torch.tensor(query_layer.size(-1), dtype=torch.float) * scale_factor) + scale = scaled_size_sqrt(query_layer, scale_factor) attention_scores = torch.bmm(query_layer, key_layer.transpose(-1, -2) / scale.to(dtype=query_layer.dtype)) if self.relative_attention: rel_embeddings = self.pos_dropout(rel_embeddings) @@ -722,8 +263,12 @@ def forward( -1, self.num_attention_heads, attention_scores.size(-2), attention_scores.size(-1) ) + attention_mask = attention_mask.bool() + attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min) # bsz x height x length x dimension - attention_probs = XSoftmax.apply(attention_scores, attention_mask, -1) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + attention_probs.masked_fill(attention_mask, 0) + attention_probs = self.dropout(attention_probs) context_layer = torch.bmm( attention_probs.view(-1, attention_probs.size(-2), attention_probs.size(-1)), value_layer @@ -735,20 +280,17 @@ def forward( ) new_context_layer_shape = context_layer.size()[:-2] + (-1,) context_layer = context_layer.view(new_context_layer_shape) - if output_attentions: - return (context_layer, attention_probs) - else: - return context_layer + if not output_attentions: + return (context_layer, None) + return (context_layer, attention_probs) def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): if relative_pos is None: - q = query_layer.size(-2) relative_pos = build_relative_position( - q, - key_layer.size(-2), + query_layer, + key_layer, bucket_size=self.position_buckets, max_position=self.max_relative_positions, - device=query_layer.device, ) if relative_pos.dim() == 2: relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) @@ -782,7 +324,7 @@ def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_ score = 0 # content->position if "c2p" in self.pos_att_type: - scale = torch.sqrt(torch.tensor(pos_key_layer.size(-1), dtype=torch.float) * scale_factor) + scale = scaled_size_sqrt(pos_key_layer, scale_factor) c2p_att = torch.bmm(query_layer, pos_key_layer.transpose(-1, -2)) c2p_pos = torch.clamp(relative_pos + att_span, 0, att_span * 2 - 1) c2p_att = torch.gather( @@ -794,19 +336,14 @@ def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_ # position->content if "p2c" in self.pos_att_type: - scale = torch.sqrt(torch.tensor(pos_query_layer.size(-1), dtype=torch.float) * scale_factor) - if key_layer.size(-2) != query_layer.size(-2): - r_pos = build_relative_position( - key_layer.size(-2), - key_layer.size(-2), - bucket_size=self.position_buckets, - max_position=self.max_relative_positions, - device=query_layer.device, - ) - r_pos = r_pos.unsqueeze(0) - else: - r_pos = relative_pos - + scale = scaled_size_sqrt(pos_query_layer, scale_factor) + r_pos = build_rpos( + query_layer, + key_layer, + relative_pos, + self.max_relative_positions, + self.position_buckets, + ) p2c_pos = torch.clamp(-r_pos + att_span, 0, att_span * 2 - 1) p2c_att = torch.bmm(key_layer, pos_query_layer.transpose(-1, -2)) p2c_att = torch.gather( @@ -819,7 +356,144 @@ def disentangled_attention_bias(self, query_layer, key_layer, 
relative_pos, rel_ return score -# Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm +# Copied from transformers.models.deberta.modeling_deberta.DebertaAttention with Deberta->DebertaV2 +class DebertaV2Attention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = DisentangledSelfAttention(config) + self.output = DebertaV2SelfOutput(config) + self.config = config + + def forward( + self, + hidden_states, + attention_mask, + output_attentions: bool = False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + self_output, att_matrix = self.self( + hidden_states, + attention_mask, + output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if query_states is None: + query_states = hidden_states + attention_output = self.output(self_output, query_states) + + if output_attentions: + return (attention_output, att_matrix) + else: + return (attention_output, None) + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->DebertaV2 +class DebertaV2Intermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaOutput with DebertaLayerNorm->LayerNorm +class DebertaV2Output(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaLayer with Deberta->DebertaV2 +class DebertaV2Layer(nn.Module): + def __init__(self, config): + super().__init__() + self.attention = DebertaV2Attention(config) + self.intermediate = DebertaV2Intermediate(config) + self.output = DebertaV2Output(config) + + def forward( + self, + hidden_states, + attention_mask, + query_states=None, + relative_pos=None, + rel_embeddings=None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + attention_output, att_matrix = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + + if output_attentions: + return (layer_output, att_matrix) + else: + return (layer_output, None) + + +class ConvLayer(nn.Module): + def __init__(self, config): + super().__init__() + kernel_size = getattr(config, "conv_kernel_size", 3) + groups = getattr(config, "conv_groups", 1) + self.conv_act = getattr(config, "conv_act", "tanh") + 
self.conv = nn.Conv1d( + config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups + ) + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, residual_states, input_mask): + out = self.conv(hidden_states.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous() + rmask = (1 - input_mask).bool() + out.masked_fill_(rmask.unsqueeze(-1).expand(out.size()), 0) + out = ACT2FN[self.conv_act](self.dropout(out)) + + layer_norm_input = residual_states + out + output = self.LayerNorm(layer_norm_input).to(layer_norm_input) + + if input_mask is None: + output_states = output + else: + if input_mask.dim() != layer_norm_input.dim(): + if input_mask.dim() == 4: + input_mask = input_mask.squeeze(1).squeeze(1) + input_mask = input_mask.unsqueeze(2) + + input_mask = input_mask.to(output.dtype) + output_states = output * input_mask + + return output_states + + +# Copied from transformers.models.deberta.modeling_deberta.DebertaEmbeddings with DebertaLayerNorm->LayerNorm,Deberta->DebertaV2 class DebertaV2Embeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" @@ -837,63 +511,197 @@ def __init__(self, config): if config.type_vocab_size > 0: self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size) + else: + self.token_type_embeddings = None if self.embedding_size != config.hidden_size: self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias=False) + else: + self.embed_proj = None + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) - self.dropout = StableDropout(config.hidden_dropout_prob) + self.dropout = nn.Dropout(config.hidden_dropout_prob) self.config = config - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer( - "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False - ) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self.position_embeddings is not None: + position_embeddings = self.position_embeddings(position_ids.long()) + else: + position_embeddings = torch.zeros_like(inputs_embeds) + + embeddings = inputs_embeds + if self.position_biased_input: + embeddings += position_embeddings + if self.token_type_embeddings is not None: + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings += token_type_embeddings + + if self.embed_proj is not None: + embeddings = self.embed_proj(embeddings) + + embeddings = self.LayerNorm(embeddings) + + if mask is not None: + if mask.dim() != embeddings.dim(): + if mask.dim() == 4: + mask = mask.squeeze(1).squeeze(1) + mask = mask.unsqueeze(2) + mask = 
mask.to(embeddings.dtype) + + embeddings = embeddings * mask + + embeddings = self.dropout(embeddings) + return embeddings + + +class DebertaV2Encoder(nn.Module): + """Modified BertEncoder with relative position bias support""" + + def __init__(self, config): + super().__init__() + + self.layer = nn.ModuleList([DebertaV2Layer(config) for _ in range(config.num_hidden_layers)]) + self.relative_attention = getattr(config, "relative_attention", False) + + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + + self.position_buckets = getattr(config, "position_buckets", -1) + pos_ebd_size = self.max_relative_positions * 2 + + if self.position_buckets > 0: + pos_ebd_size = self.position_buckets * 2 + + self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size) + + self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")] + + if "layer_norm" in self.norm_rel_ebd: + self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True) - def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] + self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None + self.gradient_checkpointing = False - seq_length = input_shape[1] + def get_rel_embedding(self): + rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None + if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd): + rel_embeddings = self.LayerNorm(rel_embeddings) + return rel_embeddings - if position_ids is None: - position_ids = self.position_ids[:, :seq_length] + def get_attention_mask(self, attention_mask): + if attention_mask.dim() <= 2: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1) + elif attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) - if token_type_ids is None: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + return attention_mask - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + if query_states is not None: + relative_pos = build_relative_position( + query_states, + hidden_states, + bucket_size=self.position_buckets, + max_position=self.max_relative_positions, + ) + else: + relative_pos = build_relative_position( + hidden_states, + hidden_states, + bucket_size=self.position_buckets, + max_position=self.max_relative_positions, + ) + return relative_pos - if self.position_embeddings is not None: - position_embeddings = self.position_embeddings(position_ids.long()) + def forward( + self, + hidden_states, + attention_mask, + output_hidden_states=True, + output_attentions=False, + query_states=None, + relative_pos=None, + return_dict=True, + ): + if attention_mask.dim() <= 2: + input_mask = attention_mask else: - position_embeddings = torch.zeros_like(inputs_embeds) + input_mask = attention_mask.sum(-2) > 0 + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) - 
embeddings = inputs_embeds - if self.position_biased_input: - embeddings += position_embeddings - if self.config.type_vocab_size > 0: - token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings += token_type_embeddings + all_hidden_states: Optional[Tuple[torch.Tensor]] = (hidden_states,) if output_hidden_states else None + all_attentions = () if output_attentions else None - if self.embedding_size != self.config.hidden_size: - embeddings = self.embed_proj(embeddings) + next_kv = hidden_states + rel_embeddings = self.get_rel_embedding() + for i, layer_module in enumerate(self.layer): + if self.gradient_checkpointing and self.training: + output_states, attn_weights = self._gradient_checkpointing_func( + layer_module.__call__, + next_kv, + attention_mask, + query_states, + relative_pos, + rel_embeddings, + output_attentions, + ) + else: + output_states, attn_weights = layer_module( + next_kv, + attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + ) - embeddings = self.LayerNorm(embeddings) + if output_attentions: + all_attentions = all_attentions + (attn_weights,) - if mask is not None: - if mask.dim() != embeddings.dim(): - if mask.dim() == 4: - mask = mask.squeeze(1).squeeze(1) - mask = mask.unsqueeze(2) - mask = mask.to(embeddings.dtype) + if i == 0 and self.conv is not None: + output_states = self.conv(hidden_states, output_states, input_mask) - embeddings = embeddings * mask + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states,) - embeddings = self.dropout(embeddings) - return embeddings + if query_states is not None: + query_states = output_states + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None + else: + next_kv = output_states + + if not return_dict: + return tuple(v for v in [output_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=output_states, hidden_states=all_hidden_states, attentions=all_attentions + ) # Copied from transformers.models.deberta.modeling_deberta.DebertaPreTrainedModel with Deberta->DebertaV2 @@ -1099,25 +907,126 @@ def forward( ) +# Copied from transformers.models.deberta.modeling_deberta.LegacyDebertaPredictionHeadTransform with Deberta->DebertaV2 +class LegacyDebertaV2PredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + + self.dense = nn.Linear(config.hidden_size, self.embedding_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class LegacyDebertaV2LMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = LegacyDebertaV2PredictionHeadTransform(config) + + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
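        # (The tie is realized through `get_output_embeddings`/`_tied_weights_keys` on
        # `DebertaV2ForMaskedLM` below: the decoder itself is created without a bias, and the
        # separate `bias` parameter is linked to it so `resize_token_embeddings` resizes both.)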
+ self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class LegacyDebertaV2OnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = LegacyDebertaV2LMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class DebertaV2LMPredictionHead(nn.Module): + """https://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=True) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # note that the input embeddings must be passed as an argument + def forward(self, hidden_states, word_embeddings): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + hidden_states = torch.matmul(hidden_states, word_embeddings.weight.t()) + self.bias + return hidden_states + + +class DebertaV2OnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.lm_head = DebertaV2LMPredictionHead(config) + + # note that the input embeddings must be passed as an argument + def forward(self, sequence_output, word_embeddings): + prediction_scores = self.lm_head(sequence_output, word_embeddings) + return prediction_scores + + @add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + _keys_to_ignore_on_load_unexpected = r"mask_predictions.*" def __init__(self, config): super().__init__(config) - + self.legacy = config.legacy self.deberta = DebertaV2Model(config) - self.cls = DebertaV2OnlyMLMHead(config) - + if self.legacy: + self.cls = LegacyDebertaV2OnlyMLMHead(config) + else: + self._tied_weights_keys = ["lm_predictions.lm_head.weight", "deberta.embeddings.word_embeddings.weight"] + self.lm_predictions = DebertaV2OnlyMLMHead(config) # Initialize weights and apply final processing self.post_init() def get_output_embeddings(self): - return self.cls.predictions.decoder + if self.legacy: + return self.cls.predictions.decoder + else: + return self.lm_predictions.lm_head.dense def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - self.cls.predictions.bias = new_embeddings.bias + if self.legacy: + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + else: + self.lm_predictions.lm_head.dense = new_embeddings + self.lm_predictions.lm_head.bias = new_embeddings.bias @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) 
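    # Illustrative sketch (editorial note, not part of this patch) of what the `legacy` switch
    # above changes; `DebertaV2Config(legacy=...)` assumes the config-side counterpart of this PR:
    #
    #     from transformers import DebertaV2Config, DebertaV2ForMaskedLM
    #
    #     model = DebertaV2ForMaskedLM(DebertaV2Config(legacy=False))
    #     assert hasattr(model, "lm_predictions") and not hasattr(model, "cls")
    #
    #     model = DebertaV2ForMaskedLM(DebertaV2Config(legacy=True))
    #     assert hasattr(model, "cls")
    #
    # With `legacy=False` the prediction scores come from a matmul against the input
    # word-embedding matrix plus a learned bias (`DebertaV2LMPredictionHead`), instead of the
    # tied `cls.predictions.decoder` linear layer used by the legacy head.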
@add_code_sample_docstrings( @@ -1160,7 +1069,10 @@ def forward( ) sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) + if self.legacy: + prediction_scores = self.cls(sequence_output) + else: + prediction_scores = self.lm_predictions(sequence_output, self.deberta.embeddings.word_embeddings) masked_lm_loss = None if labels is not None: @@ -1179,60 +1091,27 @@ def forward( ) -# Copied from transformers.models.deberta.modeling_deberta.DebertaPredictionHeadTransform with Deberta->DebertaV2 -class DebertaV2PredictionHeadTransform(nn.Module): - def __init__(self, config): - super().__init__() - self.embedding_size = getattr(config, "embedding_size", config.hidden_size) - - self.dense = nn.Linear(config.hidden_size, self.embedding_size) - if isinstance(config.hidden_act, str): - self.transform_act_fn = ACT2FN[config.hidden_act] - else: - self.transform_act_fn = config.hidden_act - self.LayerNorm = nn.LayerNorm(self.embedding_size, eps=config.layer_norm_eps) - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - - -# Copied from transformers.models.deberta.modeling_deberta.DebertaLMPredictionHead with Deberta->DebertaV2 -class DebertaV2LMPredictionHead(nn.Module): +# Copied from transformers.models.deberta.modeling_deberta.ContextPooler +class ContextPooler(nn.Module): def __init__(self, config): super().__init__() - self.transform = DebertaV2PredictionHeadTransform(config) - - self.embedding_size = getattr(config, "embedding_size", config.hidden_size) - # The output weights are the same as the input embeddings, but there is - # an output-only bias for each token. - self.decoder = nn.Linear(self.embedding_size, config.vocab_size, bias=False) - - self.bias = nn.Parameter(torch.zeros(config.vocab_size)) - - # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` - self.decoder.bias = self.bias - - def _tie_weights(self): - self.decoder.bias = self.bias + self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size) + self.dropout = nn.Dropout(config.pooler_dropout) + self.config = config def forward(self, hidden_states): - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
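        # (i.e. the [CLS] position); the pooled vector then goes through dropout, a dense
        # projection and `config.pooler_hidden_act`, mirroring the BERT pooler but with a
        # configurable activation.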
-# copied from transformers.models.bert.BertOnlyMLMHead with bert -> deberta -class DebertaV2OnlyMLMHead(nn.Module): - def __init__(self, config): - super().__init__() - self.predictions = DebertaV2LMPredictionHead(config) + context_token = hidden_states[:, 0] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output) + return pooled_output - def forward(self, sequence_output): - prediction_scores = self.predictions(sequence_output) - return prediction_scores + @property + def output_dim(self): + return self.config.hidden_size @add_start_docstrings( @@ -1256,7 +1135,7 @@ def __init__(self, config): self.classifier = nn.Linear(output_dim, num_labels) drop_out = getattr(config, "cls_dropout", None) drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) + self.dropout = nn.Dropout(drop_out) # Initialize weights and apply final processing self.post_init() @@ -1549,7 +1428,7 @@ def __init__(self, config): self.classifier = nn.Linear(output_dim, 1) drop_out = getattr(config, "cls_dropout", None) drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out - self.dropout = StableDropout(drop_out) + self.dropout = nn.Dropout(drop_out) self.init_weights() diff --git a/src/transformers/models/deformable_detr/__init__.py b/src/transformers/models/deformable_detr/__init__.py index ab44adf3718149..16a1959c30ff54 100644 --- a/src/transformers/models/deformable_detr/__init__.py +++ b/src/transformers/models/deformable_detr/__init__.py @@ -12,62 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import TYPE_CHECKING - -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available - -_import_structure = { - "configuration_deformable_detr": ["DeformableDetrConfig"], -} - -try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["feature_extraction_deformable_detr"] = ["DeformableDetrFeatureExtractor"] - _import_structure["image_processing_deformable_detr"] = ["DeformableDetrImageProcessor"] +from typing import TYPE_CHECKING -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_deformable_detr"] = [ - "DeformableDetrForObjectDetection", - "DeformableDetrModel", - "DeformableDetrPreTrainedModel", - ] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .configuration_deformable_detr import DeformableDetrConfig - - try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_deformable_detr import DeformableDetrFeatureExtractor - from .image_processing_deformable_detr import DeformableDetrImageProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_deformable_detr import ( - DeformableDetrForObjectDetection, - DeformableDetrModel, - DeformableDetrPreTrainedModel, - ) - + from .configuration_deformable_detr import * + from .feature_extraction_deformable_detr import * + from .image_processing_deformable_detr import * + from 
.image_processing_deformable_detr_fast import * + from .modeling_deformable_detr import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/deformable_detr/configuration_deformable_detr.py b/src/transformers/models/deformable_detr/configuration_deformable_detr.py index 495e1154dad309..05bd0f906a1108 100644 --- a/src/transformers/models/deformable_detr/configuration_deformable_detr.py +++ b/src/transformers/models/deformable_detr/configuration_deformable_detr.py @@ -277,3 +277,6 @@ def num_attention_heads(self) -> int: @property def hidden_size(self) -> int: return self.d_model + + +__all__ = ["DeformableDetrConfig"] diff --git a/src/transformers/models/deformable_detr/feature_extraction_deformable_detr.py b/src/transformers/models/deformable_detr/feature_extraction_deformable_detr.py index f04743e91ceefe..de1676740647f6 100644 --- a/src/transformers/models/deformable_detr/feature_extraction_deformable_detr.py +++ b/src/transformers/models/deformable_detr/feature_extraction_deformable_detr.py @@ -41,3 +41,6 @@ def __init__(self, *args, **kwargs) -> None: FutureWarning, ) super().__init__(*args, **kwargs) + + +__all__ = ["DeformableDetrFeatureExtractor"] diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index 8c149f554965a4..0b6a58eb8f1351 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -1627,3 +1627,6 @@ def post_process_object_detection( results.append({"scores": score, "labels": label, "boxes": box}) return results + + +__all__ = ["DeformableDetrImageProcessor"] diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py new file mode 100644 index 00000000000000..0a2fbc14ee94f8 --- /dev/null +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr_fast.py @@ -0,0 +1,1060 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
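# Illustrative usage sketch (editorial note, not shown in this diff). The checkpoint name and
# the `use_fast` switch are assumptions based on the existing fast-image-processor convention;
# the `device` keyword is taken from the `preprocess` signature further down in this file.
#
#     from transformers import AutoImageProcessor
#     from PIL import Image
#
#     image = Image.open("cats.jpg")  # any PIL image, NumPy array or torch tensor
#     processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr", use_fast=True)
#     inputs = processor(images=image, return_tensors="pt", device="cuda")
#     # `pixel_values`/`pixel_mask` are built with torchvision ops (optionally on GPU)
#     # instead of the PIL/NumPy path of the slow `DeformableDetrImageProcessor`.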
+"""Fast Image processor class for Deformable DETR.""" + +import functools +import pathlib +from typing import Any, Dict, List, Optional, Tuple, Union + +from ...image_processing_utils import BatchFeature, get_size_dict +from ...image_processing_utils_fast import ( + BaseImageProcessorFast, + SizeDict, + get_image_size_for_max_height_width, + get_max_height_width, + safe_squeeze, +) +from ...image_transforms import ( + center_to_corners_format, + corners_to_center_format, +) +from ...image_utils import ( + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + AnnotationFormat, + AnnotationType, + ChannelDimension, + ImageInput, + ImageType, + PILImageResampling, + get_image_size, + get_image_type, + infer_channel_dimension_format, + make_list_of_images, + pil_torch_interpolation_mapping, + validate_annotations, + validate_kwargs, +) +from ...utils import ( + TensorType, + is_torch_available, + is_torchvision_available, + is_torchvision_v2_available, + is_vision_available, + logging, +) +from .image_processing_deformable_detr import ( + get_size_with_aspect_ratio, +) + + +if is_torch_available(): + import torch + +if is_torchvision_available(): + from torchvision.io import read_image + + if is_vision_available(): + from ...image_utils import pil_torch_interpolation_mapping + + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + + +logger = logging.get_logger(__name__) + +SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC) + + +# Copied from transformers.models.detr.image_processing_detr_fast.convert_coco_poly_to_mask +def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`List[List[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8, device=device) + mask = torch.any(mask, axis=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, axis=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8, device=device) + + return masks + + +# Copied from transformers.models.detr.image_processing_detr_fast.prepare_coco_detection_annotation with DETR->DeformableDetr +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by DeformableDetr. + """ + image_height, image_width = image.size()[-2:] + + image_id = target["image_id"] + image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device) + + # Get all COCO annotations for the given image. 
+ annotations = target["annotations"] + classes = [] + area = [] + boxes = [] + keypoints = [] + for obj in annotations: + if "iscrowd" not in obj or obj["iscrowd"] == 0: + classes.append(obj["category_id"]) + area.append(obj["area"]) + boxes.append(obj["bbox"]) + if "keypoints" in obj: + keypoints.append(obj["keypoints"]) + + classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device) + area = torch.as_tensor(area, dtype=torch.float32, device=image.device) + iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device) + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width) + boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + + new_target = { + "image_id": image_id, + "class_labels": classes[keep], + "boxes": boxes[keep], + "area": area[keep], + "iscrowd": iscrowd[keep], + "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device), + } + + if keypoints: + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device) + # Apply the keep mask here to filter the relevant annotations + keypoints = keypoints[keep] + num_keypoints = keypoints.shape[0] + keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints + new_target["keypoints"] = keypoints + + if return_segmentation_masks: + segmentation_masks = [obj["segmentation"] for obj in annotations] + masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width, device=image.device) + new_target["masks"] = masks[keep] + + return new_target + + +# Copied from transformers.models.detr.image_processing_detr_fast.masks_to_boxes +def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor: + """ + Compute the bounding boxes around the provided panoptic segmentation masks. + + Args: + masks: masks in format `[number_masks, height, width]` where N is the number of masks + + Returns: + boxes: bounding boxes in format `[number_masks, 4]` in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + y = torch.arange(0, h, dtype=torch.float32, device=masks.device) + x = torch.arange(0, w, dtype=torch.float32, device=masks.device) + # see https://github.com/pytorch/pytorch/issues/50276 + y, x = torch.meshgrid(y, x, indexing="ij") + + x_mask = masks * torch.unsqueeze(x, 0) + x_max = x_mask.view(x_mask.shape[0], -1).max(-1)[0] + x_min = ( + torch.where(masks, x.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0] + ) + + y_mask = masks * torch.unsqueeze(y, 0) + y_max = y_mask.view(y_mask.shape[0], -1).max(-1)[0] + y_min = ( + torch.where(masks, y.unsqueeze(0), torch.tensor(1e8, device=masks.device)).view(masks.shape[0], -1).min(-1)[0] + ) + + return torch.stack([x_min, y_min, x_max, y_max], 1) + + +# Copied from transformers.models.detr.image_processing_detr_fast.rgb_to_id +def rgb_to_id(color): + """ + Converts RGB color to unique ID. 
+ """ + if isinstance(color, torch.Tensor) and len(color.shape) == 3: + if color.dtype == torch.uint8: + color = color.to(torch.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +# Copied from transformers.models.detr.image_processing_detr_fast.prepare_coco_panoptic_annotation with DETR->DeformableDetr +def prepare_coco_panoptic_annotation( + image: torch.Tensor, + target: Dict, + masks_path: Union[str, pathlib.Path], + return_masks: bool = True, + input_data_format: Union[ChannelDimension, str] = None, +) -> Dict: + """ + Prepare a coco panoptic annotation for DeformableDetr. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + annotation_path = pathlib.Path(masks_path) / target["file_name"] + + new_target = {} + new_target["image_id"] = torch.as_tensor( + [target["image_id"] if "image_id" in target else target["id"]], dtype=torch.int64, device=image.device + ) + new_target["size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device) + new_target["orig_size"] = torch.as_tensor([image_height, image_width], dtype=torch.int64, device=image.device) + + if "segments_info" in target: + masks = read_image(annotation_path).permute(1, 2, 0).to(torch.int32).to(image.device) + masks = rgb_to_id(masks) + + ids = torch.as_tensor([segment_info["id"] for segment_info in target["segments_info"]], device=image.device) + masks = masks == ids[:, None, None] + masks = masks.to(torch.bool) + if return_masks: + new_target["masks"] = masks + new_target["boxes"] = masks_to_boxes(masks) + new_target["class_labels"] = torch.as_tensor( + [segment_info["category_id"] for segment_info in target["segments_info"]], + dtype=torch.int64, + device=image.device, + ) + new_target["iscrowd"] = torch.as_tensor( + [segment_info["iscrowd"] for segment_info in target["segments_info"]], + dtype=torch.int64, + device=image.device, + ) + new_target["area"] = torch.as_tensor( + [segment_info["area"] for segment_info in target["segments_info"]], + dtype=torch.float32, + device=image.device, + ) + + return new_target + + +class DeformableDetrImageProcessorFast(BaseImageProcessorFast): + r""" + Constructs a fast Deformable DETR image processor. + + Args: + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. 
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the + `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`): + Mean values to use when normalizing the image. Can be a single value or a list of values, one for each + channel. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`): + Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one + for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_annotations (`bool`, *optional*, defaults to `True`): + Controls whether to convert the annotations to the format expected by the DETR model. Converts the + bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. + Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + + model_input_names = ["pixel_values", "pixel_mask"] + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.__init__ + def __init__( + self, + format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: Union[PILImageResampling, "F.InterpolationMode"] = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Union[float, List[float]] = None, + image_std: Union[float, List[float]] = None, + do_convert_annotations: Optional[bool] = None, + do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` parameter is deprecated and will be removed in v4.26. 
" + "Please specify in `size['longest_edge'] instead`.", + ) + max_size = kwargs.pop("max_size") + else: + max_size = None if size is None else 1333 + + size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333} + size = get_size_dict(size, max_size=max_size, default_to_square=False) + + # Backwards compatibility + if do_convert_annotations is None: + do_convert_annotations = do_normalize + + super().__init__(**kwargs) + self.format = format + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.do_convert_annotations = do_convert_annotations + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD + self.do_pad = do_pad + self.pad_size = pad_size + self._valid_processor_keys = [ + "images", + "annotations", + "return_segmentation_masks", + "masks_path", + "do_resize", + "size", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "do_convert_annotations", + "image_mean", + "image_std", + "do_pad", + "pad_size", + "format", + "return_tensors", + "data_format", + "input_data_format", + ] + + @classmethod + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.from_dict with Detr->DeformableDetr + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is + created using from_dict and kwargs e.g. `DeformableDetrImageProcessorFast.from_pretrained(checkpoint, size=600, + max_size=800)` + """ + image_processor_dict = image_processor_dict.copy() + if "max_size" in kwargs: + image_processor_dict["max_size"] = kwargs.pop("max_size") + if "pad_and_return_pixel_mask" in kwargs: + image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask") + return super().from_dict(image_processor_dict, **kwargs) + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.prepare_annotation with DETR->DeformableDetr + def prepare_annotation( + self, + image: torch.Tensor, + target: Dict, + format: Optional[AnnotationFormat] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> Dict: + """ + Prepare an annotation for feeding into DeformableDetr model. 
+ """ + format = format if format is not None else self.format + + if format == AnnotationFormat.COCO_DETECTION: + return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_detection_annotation( + image, target, return_segmentation_masks, input_data_format=input_data_format + ) + elif format == AnnotationFormat.COCO_PANOPTIC: + return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks + target = prepare_coco_panoptic_annotation( + image, + target, + masks_path=masks_path, + return_masks=return_segmentation_masks, + input_data_format=input_data_format, + ) + else: + raise ValueError(f"Format {format} is not supported.") + return target + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize + def resize( + self, + image: torch.Tensor, + size: SizeDict, + interpolation: "F.InterpolationMode" = None, + **kwargs, + ) -> torch.Tensor: + """ + Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an + int, smaller edge of the image will be matched to this number. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + Resampling filter to use if resizing the image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if size.shortest_edge and size.longest_edge: + # Resize the image so that the shortest edge or the longest edge is of the given size + # while maintaining the aspect ratio of the original image. + new_size = get_size_with_aspect_ratio( + image.size()[-2:], + size["shortest_edge"], + size["longest_edge"], + ) + elif size.max_height and size.max_width: + new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"]) + elif size.height and size.width: + new_size = (size["height"], size["width"]) + else: + raise ValueError( + "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" + f" {size.keys()}." + ) + + image = F.resize( + image, + size=new_size, + interpolation=interpolation, + **kwargs, + ) + return image + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize_annotation + def resize_annotation( + self, + annotation: Dict[str, Any], + orig_size: Tuple[int, int], + target_size: Tuple[int, int], + threshold: float = 0.5, + interpolation: "F.InterpolationMode" = None, + ): + """ + Resizes an annotation to a target size. + + Args: + annotation (`Dict[str, Any]`): + The annotation dictionary. + orig_size (`Tuple[int, int]`): + The original size of the input image. 
+ target_size (`Tuple[int, int]`): + The target size of the image, as returned by the preprocessing `resize` step. + threshold (`float`, *optional*, defaults to 0.5): + The threshold used to binarize the segmentation masks. + resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`): + The resampling filter to use when resizing the masks. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST + ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] + + new_annotation = {} + new_annotation["size"] = target_size + + for key, value in annotation.items(): + if key == "boxes": + boxes = value + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device + ) + new_annotation["boxes"] = scaled_boxes + elif key == "area": + area = value + scaled_area = area * (ratio_width * ratio_height) + new_annotation["area"] = scaled_area + elif key == "masks": + masks = value[:, None] + masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks] + masks = torch.stack(masks).to(torch.float32) + masks = masks[:, 0] > threshold + new_annotation["masks"] = masks + elif key == "size": + new_annotation["size"] = target_size + else: + new_annotation[key] = value + + return new_annotation + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.normalize_annotation + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= torch.as_tensor( + [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device + ) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._update_annotation_for_padded_image + def _update_annotation_for_padded_image( + self, + annotation: Dict, + input_image_size: Tuple[int, int], + output_image_size: Tuple[int, int], + padding, + update_bboxes, + ) -> Dict: + """ + Update the annotation for a padded image. 
+ """ + new_annotation = {} + new_annotation["size"] = output_image_size + ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size)) + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = F.pad( + masks, + padding, + fill=0, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.pad + def pad( + self, + image: torch.Tensor, + padded_size: Tuple[int, int], + annotation: Optional[Dict[str, Any]] = None, + update_bboxes: bool = True, + fill: int = 0, + ): + original_size = image.size()[-2:] + padding_bottom = padded_size[0] - original_size[0] + padding_right = padded_size[1] - original_size[1] + if padding_bottom < 0 or padding_right < 0: + raise ValueError( + f"Padding dimensions are negative. Please make sure that the padded size is larger than the " + f"original size. Got padded size: {padded_size}, original size: {original_size}." + ) + if original_size != padded_size: + padding = [0, 0, padding_right, padding_bottom] + image = F.pad(image, padding, fill=fill) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, original_size, padded_size, padding, update_bboxes + ) + + # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device) + pixel_mask[: original_size[0], : original_size[1]] = 1 + + return image, pixel_mask, annotation + + @functools.lru_cache(maxsize=1) + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._validate_input_arguments + def _validate_input_arguments( + self, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Union[float, List[float]], + image_std: Union[float, List[float]], + do_resize: bool, + size: Dict[str, int], + resample: "PILImageResampling", + data_format: Union[str, ChannelDimension], + return_tensors: Union[TensorType, str], + ): + if return_tensors != "pt": + raise ValueError("Only returning PyTorch tensors is currently supported.") + + if data_format != ChannelDimension.FIRST: + raise ValueError("Only channel first data format is currently supported.") + + if do_resize and None in (size, resample): + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and None in (image_mean, image_std): + raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.") + + # Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.preprocess + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: Optional[Union[PILImageResampling, "F.InterpolationMode"]] = None, + 
do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + do_convert_annotations: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotationFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. + If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to self.size): + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. + resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations): + Whether to convert the annotations to the format expected by the model. 
Converts the bounding + boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` + and in relative coordinates. + image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. + format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): + Format of the annotations. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." + ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." 
+ ) + size = kwargs.pop("max_size") + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_convert_annotations = ( + self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations + ) + do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size + format = self.format if format is None else format + device = kwargs.pop("device", None) + + # Make hashable for cache + size = SizeDict(**size) + image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean + image_std = tuple(image_std) if isinstance(image_std, list) else image_std + + images = make_list_of_images(images) + image_type = get_image_type(images[0]) + + if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]: + raise ValueError(f"Unsupported input image type {image_type}") + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + self._validate_input_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + return_tensors=return_tensors, + data_format=data_format, + ) + + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + data = {} + if image_type == ImageType.PIL: + images = [F.pil_to_tensor(image) for image in images] + elif image_type == ImageType.NUMPY: + # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays + images = [torch.from_numpy(image).contiguous() for image in images] + + if device is not None: + images = [image.to(device) for image in images] + + # We assume that all images have the same channel dimension format. 
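        # The format (inferred from the first image when not given) is applied to the whole
        # batch, and channels-last inputs are permuted to channels-first once here. For the
        # fused rescale-and-normalize a few lines below: since (x * r - mean) / std equals
        # (x - mean / r) / (std / r), scaling the mean and std by 1 / rescale_factor lets a
        # single `F.normalize` call perform both steps.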
+ if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + if input_data_format == ChannelDimension.LAST: + images = [image.permute(2, 0, 1).contiguous() for image in images] + input_data_format = ChannelDimension.FIRST + + if do_rescale and do_normalize: + # fused rescale and normalize + new_mean = torch.tensor(image_mean, device=images[0].device) * (1.0 / rescale_factor) + new_std = torch.tensor(image_std, device=images[0].device) * (1.0 / rescale_factor) + + processed_images = [] + processed_annotations = [] + pixel_masks = [] # Initialize pixel_masks here + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + annotation = self.prepare_annotation( + image, + annotation, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + + if do_resize: + interpolation = ( + pil_torch_interpolation_mapping[resample] + if isinstance(resample, (PILImageResampling, int)) + else resample + ) + resized_image = self.resize(image, size=size, interpolation=interpolation) + if annotations is not None: + annotation = self.resize_annotation( + annotation, + orig_size=image.size()[-2:], + target_size=resized_image.size()[-2:], + ) + image = resized_image + + if do_rescale and do_normalize: + # fused rescale and normalize + image = F.normalize(image.to(dtype=torch.float32), new_mean, new_std) + elif do_rescale: + image = image * rescale_factor + elif do_normalize: + image = F.normalize(image, image_mean, image_std) + + if do_convert_annotations and annotations is not None: + annotation = self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + + processed_images.append(image) + processed_annotations.append(annotation) + images = processed_images + annotations = processed_annotations if annotations is not None else None + + if do_pad: + # depends on all resized image shapes so we need another loop + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images) + + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)): + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + if padded_size == image.size()[-2:]: + padded_images.append(image) + pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device)) + padded_annotations.append(annotation) + continue + image, pixel_mask, annotation = self.pad( + image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations + ) + padded_images.append(image) + padded_annotations.append(annotation) + pixel_masks.append(pixel_mask) + images = padded_images + annotations = padded_annotations if annotations is not None else None + data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)}) + + data.update({"pixel_values": torch.stack(images, dim=0)}) + encoded_inputs = BatchFeature(data, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + return encoded_inputs + + # Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process + def post_process(self, 
outputs, target_sizes): + """ + Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DeformableDetrObjectDetectionOutput`]): + Raw outputs of the model. + target_sizes (`torch.Tensor` of shape `(batch_size, 2)`): + Tensor containing the size (height, width) of each image of the batch. For evaluation, this must be the + original image size (before any data augmentation). For visualization, this should be the image size + after data augment, but before padding. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + logger.warning_once( + "`post_process` is deprecated and will be removed in v5 of Transformers, please use" + " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.", + ) + + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if len(out_logits) != len(target_sizes): + raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits") + if target_sizes.shape[1] != 2: + raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch") + + prob = out_logits.sigmoid() + topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), 100, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) + boxes = boxes * scale_fct[:, None, :] + + results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)] + + return results + + # Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process_object_detection + def post_process_object_detection( + self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100 + ): + """ + Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x, + top_left_y, bottom_right_x, bottom_right_y) format. Only supports PyTorch. + + Args: + outputs ([`DetrObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + (height, width) of each image in the batch. If left to None, predictions will not be resized. + top_k (`int`, *optional*, defaults to 100): + Keep only top k bounding boxes before filtering by thresholding. + + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. 
+ """ + out_logits, out_bbox = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(out_logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + prob = out_logits.sigmoid() + prob = prob.view(out_logits.shape[0], -1) + k_value = min(top_k, prob.size(1)) + topk_values, topk_indexes = torch.topk(prob, k_value, dim=1) + scores = topk_values + topk_boxes = torch.div(topk_indexes, out_logits.shape[2], rounding_mode="floor") + labels = topk_indexes % out_logits.shape[2] + boxes = center_to_corners_format(out_bbox) + boxes = torch.gather(boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4)) + + # and from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results + + +__all__ = ["DeformableDetrImageProcessorFast"] diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index f47221e55ad8ad..3fd6914a6c62dd 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -2064,3 +2064,10 @@ def forward( ) return dict_outputs + + +__all__ = [ + "DeformableDetrForObjectDetection", + "DeformableDetrModel", + "DeformableDetrPreTrainedModel", +] diff --git a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py index cf191f797f72df..2c7b687c4d9897 100644 --- a/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py +++ b/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py @@ -109,7 +109,8 @@ def convert_transfo_xl_checkpoint_to_pytorch( "--transfo_xl_dataset_file", default="", type=str, - help="An optional dataset file to be converted in a vocabulary.", + help="An optional dataset file to be converted in a vocabulary.\n" + "Given the files are in the pickle format, please be wary of passing it files you trust.", ) args = parser.parse_args() convert_transfo_xl_checkpoint_to_pytorch( diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index 59c628786328e6..4667c413457b19 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -224,16 +224,16 @@ def forward(self, hidden_states, size=None): hidden_states = hidden_states[::-1] fused_hidden_states = [] - # first layer only uses the last hidden_state - size = hidden_states[1].shape[2:] - fused_hidden_state = self.layers[0](hidden_states[0], size=size) - fused_hidden_states.append(fused_hidden_state) + fused_hidden_state = None - # looping from the last layer to the second - for idx, 
(hidden_state, layer) in enumerate(zip(hidden_states[1:], self.layers[1:])): - size = hidden_states[1:][idx + 1].shape[2:] if idx != (len(hidden_states[1:]) - 1) else None + for idx, (hidden_state, layer) in enumerate(zip(hidden_states, self.layers)): + size = hidden_states[idx + 1].shape[2:] if idx != (len(hidden_states) - 1) else None - fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size) + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state, size=size) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size) fused_hidden_states.append(fused_hidden_state) diff --git a/src/transformers/models/detr/__init__.py b/src/transformers/models/detr/__init__.py index cc6687ff8bb4b4..4b8aae5e70381b 100644 --- a/src/transformers/models/detr/__init__.py +++ b/src/transformers/models/detr/__init__.py @@ -14,62 +14,18 @@ from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available - - -_import_structure = {"configuration_detr": ["DetrConfig", "DetrOnnxConfig"]} - -try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["feature_extraction_detr"] = ["DetrFeatureExtractor"] - _import_structure["image_processing_detr"] = ["DetrImageProcessor"] - _import_structure["image_processing_detr_fast"] = ["DetrImageProcessorFast"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_detr"] = [ - "DetrForObjectDetection", - "DetrForSegmentation", - "DetrModel", - "DetrPreTrainedModel", - ] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .configuration_detr import DetrConfig, DetrOnnxConfig - - try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_detr import DetrFeatureExtractor - from .image_processing_detr import DetrImageProcessor - from .image_processing_detr_fast import DetrImageProcessorFast - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_detr import ( - DetrForObjectDetection, - DetrForSegmentation, - DetrModel, - DetrPreTrainedModel, - ) - + from .configuration_detr import * + from .feature_extraction_detr import * + from .image_processing_detr import * + from .image_processing_detr_fast import * + from .modeling_detr import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 8b4a5b08dab2f6..90cd3b1345e357 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -284,3 +284,6 @@ def atol_for_validation(self) -> float: @property def default_onnx_opset(self) -> int: return 12 + + +__all__ = ["DetrConfig", "DetrOnnxConfig"] diff --git a/src/transformers/models/detr/feature_extraction_detr.py 
b/src/transformers/models/detr/feature_extraction_detr.py index 6ea33666466f9a..c2b29211099783 100644 --- a/src/transformers/models/detr/feature_extraction_detr.py +++ b/src/transformers/models/detr/feature_extraction_detr.py @@ -41,3 +41,6 @@ def __init__(self, *args, **kwargs) -> None: FutureWarning, ) super().__init__(*args, **kwargs) + + +__all__ = ["DetrFeatureExtractor"] diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py index 10d1b4d5d4a5c4..85fbe698d10a12 100644 --- a/src/transformers/models/detr/image_processing_detr.py +++ b/src/transformers/models/detr/image_processing_detr.py @@ -2042,3 +2042,6 @@ def post_process_panoptic_segmentation( results.append({"segmentation": segmentation, "segments_info": segments}) return results + + +__all__ = ["DetrImageProcessor"] diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index b414b4224e683c..f010ffe272294f 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -416,7 +416,7 @@ def __init__( def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): """ Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is - created using from_dict and kwargs e.g. `DetrImageProcessor.from_pretrained(checkpoint, size=600, + created using from_dict and kwargs e.g. `DetrImageProcessorFast.from_pretrained(checkpoint, size=600, max_size=800)` """ image_processor_dict = image_processor_dict.copy() @@ -863,6 +863,7 @@ def preprocess( input_data_format = infer_channel_dimension_format(images[0]) if input_data_format == ChannelDimension.LAST: images = [image.permute(2, 0, 1).contiguous() for image in images] + input_data_format = ChannelDimension.FIRST if do_rescale and do_normalize: # fused rescale and normalize @@ -1494,3 +1495,6 @@ def post_process_panoptic_segmentation( results.append({"segmentation": segmentation, "segments_info": segments}) return results + + +__all__ = ["DetrImageProcessorFast"] diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index f51362d94e1722..ac9a78671f8fea 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -1805,3 +1805,11 @@ def forward(self, q, k, mask: Optional[Tensor] = None): weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights + + +__all__ = [ + "DetrForObjectDetection", + "DetrForSegmentation", + "DetrModel", + "DetrPreTrainedModel", +] diff --git a/src/transformers/models/dinov2/configuration_dinov2.py b/src/transformers/models/dinov2/configuration_dinov2.py index 2df883de1699de..9590cddc347fe9 100644 --- a/src/transformers/models/dinov2/configuration_dinov2.py +++ b/src/transformers/models/dinov2/configuration_dinov2.py @@ -60,7 +60,7 @@ class Dinov2Config(BackboneConfigMixin, PretrainedConfig): The epsilon used by the layer normalization layers. image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): + patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. num_channels (`int`, *optional*, defaults to 3): The number of input channels. 
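For context on the corrected `patch_size` default: DINOv2 checkpoints use 14×14 patches, so with the default `image_size=224` the patch grid works out as below (a quick sanity check, not code from the diff):

```python
# With patch_size=14, the 224x224 default image splits evenly into a 16x16 grid.
image_size, patch_size = 224, 14
patches_per_side = image_size // patch_size   # 16
num_patches = patches_per_side ** 2           # 256
seq_len = num_patches + 1                     # +1 for the [CLS] token -> 257
print(patches_per_side, num_patches, seq_len)

# The previous default of 16 would instead describe a 14x14 grid (196 patches),
# which does not match the released DINOv2 weights.
```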
@@ -118,7 +118,7 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-6, image_size=224, - patch_size=16, + patch_size=14, num_channels=3, qkv_bias=True, layerscale_value=1.0, diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 2d4654a234c2c6..5886d288b88271 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -689,12 +689,13 @@ def forward(self, hidden_states): hidden_states = hidden_states[::-1] fused_hidden_states = [] - # first layer only uses the last hidden_state - fused_hidden_state = self.layers[0](hidden_states[0]) - fused_hidden_states.append(fused_hidden_state) - # looping from the last layer to the second - for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): - fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_state = None + for hidden_state, layer in zip(hidden_states, self.layers): + if fused_hidden_state is None: + # first layer only uses the last hidden_state + fused_hidden_state = layer(hidden_state) + else: + fused_hidden_state = layer(fused_hidden_state, hidden_state) fused_hidden_states.append(fused_hidden_state) return fused_hidden_states diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 504dcf10b206c3..faea670ecbf428 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -1277,12 +1277,18 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. 
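To make the memory argument concrete, a rough sketch of what slicing before the LM head saves (shapes are placeholders, not Falcon's real dimensions):

```python
# Sketch: during generation only the last position's logits are needed, so
# projecting a [batch, 1, hidden] slice avoids a [batch, seq_len, vocab] tensor.
import torch

batch, seq_len, hidden, vocab = 1, 1024, 512, 32000   # illustrative sizes
lm_head = torch.nn.Linear(hidden, vocab, bias=False)
hidden_states = torch.randn(batch, seq_len, hidden)

full_logits = lm_head(hidden_states)               # num_logits_to_keep=0 -> all positions
last_logits = lm_head(hidden_states[:, -1:, :])    # num_logits_to_keep=1 -> last position only

print(full_logits.shape)                           # torch.Size([1, 1024, 32000])
print(last_logits.shape)                           # torch.Size([1, 1, 32000])
print(full_logits.numel() // last_logits.numel())  # 1024x fewer logits kept in memory

# Note: with num_logits_to_keep=0 the slice `hidden_states[:, -0:, :]` is the full
# tensor (since -0 == 0), which is why 0 is the "keep everything" special case.
```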
""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1302,7 +1308,7 @@ def forward( ) hidden_states = transformer_outputs[0] - lm_logits = self.lm_head(hidden_states) + lm_logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index c8c758e6888a59..2df5dbc8b29177 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -346,7 +346,7 @@ def prepare_inputs_for_generation( ): # Overwritten -- in specific circumstances we don't want to forward image inputs to the model - if past_key_values: + if past_key_values is not None: input_ids = input_ids[:, -1:] position_ids = kwargs.get("position_ids", None) @@ -355,7 +355,7 @@ def prepare_inputs_for_generation( position_ids = attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask == 0, 1) if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) + position_ids = position_ids[:, -1:] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step if inputs_embeds is not None and past_key_values is None: @@ -377,3 +377,12 @@ def prepare_inputs_for_generation( } ) return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index e170803cccab70..346f386ba698f2 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,7 +20,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index fa3fadc4349a3c..52d02995016167 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -720,7 +720,10 @@ def __init__(self, config: GemmaConfig): [GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() @@ -805,7 +808,7 @@ def forward( all_self_attns = () if output_attentions else None next_decoder_cache = None - for decoder_layer in self.layers: + for decoder_layer in self.layers[: self.config.num_hidden_layers]: if output_hidden_states: all_hidden_states += (hidden_states,) @@ -982,6 +985,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index 807f91ff9e6baa..ad1348ae5e3163 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -886,7 +886,7 @@ def forward( all_self_attns = () if output_attentions else None next_decoder_cache = None - for decoder_layer in self.layers: + for decoder_layer in self.layers[: self.config.num_hidden_layers]: if output_hidden_states: all_hidden_states += (hidden_states,) diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 626e5537fc06bc..58836a5631c2c0 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -27,7 +27,6 @@ from ...activations import ACT2FN from ...cache_utils import Cache, HybridCache from ...generation import GenerationMixin -from ...modeling_flash_attention_utils import _flash_attention_forward from ...modeling_outputs import ( BaseModelOutputWithPast, CausalLMOutputWithPast, @@ -41,7 +40,7 @@ add_start_docstrings_to_model_forward, is_flash_attn_2_available, is_flash_attn_greater_or_equal, - is_flash_attn_greater_or_equal_2_10, + is_torch_greater_or_equal, logging, replace_return_docstrings, ) @@ -51,6 +50,8 @@ if is_flash_attn_2_available(): from ...modeling_flash_attention_utils import _flash_attention_forward +if is_torch_greater_or_equal("2.5"): + from torch.nn.attention.flex_attention import flex_attention logger = logging.get_logger(__name__) @@ -168,6 +169,161 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) +def eager_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + **_kwargs, +) -> Tuple[torch.Tensor, torch.Tensor]: + key_states = repeat_kv(key, config.num_key_value_groups) + value_states = repeat_kv(value, config.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling + + if config.attn_logit_softcapping is not 
None: + attn_weights = attn_weights / config.attn_logit_softcapping + attn_weights = torch.tanh(attn_weights) + attn_weights = attn_weights * config.attn_logit_softcapping + if mask is not None: # no matter the length, we just slice it + causal_mask = mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +def flash_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + target_dtype: torch.dtype = torch.float16, + **_kwargs, +) -> Tuple[torch.Tensor, None]: + if mask is not None: + seq_len = mask.shape[1] + query = query[:, :, :seq_len] + value = value[:, :, :seq_len] + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout + # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding + query_states = query.transpose(1, 2) + key_states = key.transpose(1, 2) + value_states = value.transpose(1, 2) + + dropout_rate = config.attention_dropout if config.training else 0.0 + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + mask, + seq_len, + dropout=dropout_rate, + softmax_scale=config.scaling, + is_causal=config.is_causal, + sliding_window=config.sliding_window, + use_top_left_mask=config._flash_attn_uses_top_left_mask, + softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None, + ) + + return attn_output, None + + +def flex_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + output_attentions: bool = False, + **_kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + def tanh_softcap(score, b, h, q_idx, kv_idx): + soft_cap = config.attn_logit_softcapping + score = soft_cap * torch.tanh(score / soft_cap) + if mask is not None: + return score + mask[b][0][q_idx][kv_idx] + return score + + attn_output = flex_attention( + query, + key, + value, + score_mod=tanh_softcap, + enable_gqa=True, + scale=config.scaling, + return_lse=output_attentions, + ) + if not output_attentions: + attn_weights = None + else: + attn_output, attn_weights = attn_output + + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +def sdpa_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + **_kwargs, +) -> Tuple[torch.Tensor, None]: + key = repeat_kv(key, config.num_key_value_groups) + value = repeat_kv(value, config.num_key_value_groups) + + causal_mask = mask + if mask is not None: + causal_mask = causal_mask[:, :, :, : key.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
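As an aside on the soft-capping used by the eager and flex attention paths above (an illustrative sketch, not code from this file): the scores are squashed smoothly into `(-cap, +cap)` rather than hard-clipped.

```python
# cap * tanh(score / cap) keeps small scores nearly unchanged while bounding
# extreme scores, so the subsequent softmax cannot saturate.
import torch

cap = 50.0  # representative value; the model reads it from config.attn_logit_softcapping
scores = torch.tensor([-1000.0, -10.0, 0.0, 10.0, 1000.0])
capped = cap * torch.tanh(scores / cap)
print(capped)  # approximately [-50.00, -9.87, 0.00, 9.87, 50.00]
```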
+ if query.device.type == "cuda" and causal_mask is not None: + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = True if causal_mask is None and query.shape[1] > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query, + key, + value, + attn_mask=causal_mask, + dropout_p=config.attention_dropout if config.training else 0.0, + is_causal=is_causal, + scale=config.scaling, + ) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, None + + +GEMMA2_ATTENTION_FUNCTION = { + "flash_attention_2": flash_attention_forward, + "flex_attention": flex_attention_forward, + "eager": eager_attention_forward, + "sdpa": sdpa_attention_forward, +} + + class Gemma2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -175,12 +331,6 @@ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): super().__init__() self.config = config self.layer_idx = layer_idx - if layer_idx is None: - logger.warning_once( - f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " - "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " - "when creating this class." - ) self.attention_dropout = config.attention_dropout self.hidden_size = config.hidden_size @@ -192,7 +342,8 @@ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): self.rope_theta = config.rope_theta self.is_causal = True self.scaling = config.query_pre_attn_scalar**-0.5 - + self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None + self.attn_logit_softcapping = config.attn_logit_softcapping if self.hidden_size % self.num_heads != 0: raise ValueError( f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" @@ -208,7 +359,6 @@ def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) - self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None def forward( self, @@ -243,33 +393,17 @@ def forward( } key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling - - if self.config.attn_logit_softcapping is not None: - attn_weights = attn_weights / self.config.attn_logit_softcapping - attn_weights = torch.tanh(attn_weights) - attn_weights = attn_weights * self.config.attn_logit_softcapping - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, 
self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) + if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]: + logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`") + attention_type = "flex_attention" + else: + attention_type = self.config._attn_implementation - attn_output = attn_output.transpose(1, 2).contiguous() + attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type]( + self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions + ) - attn_output = attn_output.view(bsz, q_len, -1) + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() attn_output = self.o_proj(attn_output) if not output_attentions: @@ -279,230 +413,36 @@ def forward( class Gemma2FlashAttention2(Gemma2Attention): - """ - Gemma2 flash attention module. This module inherits from `Gemma2Attention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
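The refactor above replaces the per-backend attention subclasses with a dispatch table keyed by `config._attn_implementation`. A stripped-down sketch of that pattern (names and shapes simplified; this is not the actual Gemma 2 code):

```python
# Minimal dispatch-table sketch: every backend exposes the same signature, and the
# caller selects one at runtime from a config string.
import torch
import torch.nn.functional as F

def eager_attention(q, k, v, scale):
    # explicit softmax(Q K^T * scale) V, mirroring the "eager" path
    weights = torch.softmax((q @ k.transpose(-2, -1)) * scale, dim=-1)
    return weights @ v

def sdpa_attention(q, k, v, scale):
    # torch's fused kernel; the `scale` kwarg needs a reasonably recent torch (>= 2.1)
    return F.scaled_dot_product_attention(q, k, v, scale=scale)

ATTENTION_FUNCTIONS = {"eager": eager_attention, "sdpa": sdpa_attention}

q = k = v = torch.randn(1, 2, 8, 16)                # [batch, heads, seq, head_dim]
chosen = "sdpa"                                     # stands in for config._attn_implementation
out = ATTENTION_FUNCTIONS[chosen](q, k, v, scale=16 ** -0.5)
ref = ATTENTION_FUNCTIONS["eager"](q, k, v, scale=16 ** -0.5)
print(torch.allclose(out, ref, atol=1e-4))          # True: backends are interchangeable
```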
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - "sin": sin, - "cos": cos, - "sliding_window": self.sliding_window, - "cache_position": cache_position, - } - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - if attention_mask is not None: - seq_len = attention_mask.shape[1] - key_states = key_states[:, :, :seq_len] - value_states = value_states[:, :, :seq_len] - - # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache - # to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = self.attention_dropout if self.training else 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. (Gemma2RMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - attn_output = _flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - softmax_scale=self.scaling, - is_causal=self.is_causal, - sliding_window=self.sliding_window, - use_top_left_mask=self._flash_attn_uses_top_left_mask, - softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None, + def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + self.config._attn_implementation = "flash_attention_2" + logger.warning_once( + "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`" + "attribute of the `GemmaAttention` class! It will be removed in v4.48" ) - attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - class Gemma2SdpaAttention(Gemma2Attention): - """ - Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Gemma2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Gemma2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - "sin": sin, - "cos": cos, - "sliding_window": self.sliding_window, - "cache_position": cache_position, - } - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - causal_mask = attention_mask - if attention_mask is not None: - causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and causal_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. - is_causal = True if causal_mask is None and q_len > 1 else False - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=causal_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=is_causal, - scale=self.scaling, + def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + self.config._attn_implementation = "sdpa" + logger.warning_once( + "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`" + "attribute of the `GemmaAttention` class! 
It will be removed in v4.48" ) - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, -1) - - attn_output = self.o_proj(attn_output) - - return attn_output, None, past_key_value - - -GEMMA2_ATTENTION_CLASSES = { - "eager": Gemma2Attention, - "flash_attention_2": Gemma2FlashAttention2, - "sdpa": Gemma2SdpaAttention, -} - class Gemma2DecoderLayer(nn.Module): def __init__(self, config: Gemma2Config, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size - self.self_attn = GEMMA2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + self.config = config + self.is_sliding = not bool(layer_idx % 2) + self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx) self.mlp = Gemma2MLP(config) self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.config = config - self.is_sliding = not bool(layer_idx % 2) + self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.sliding_window = config.sliding_window @@ -517,25 +457,6 @@ def forward( use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: - """ - Args: - hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` - attention_mask (`torch.FloatTensor`, *optional*): - attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, - query_sequence_length, key_sequence_length)` if default attention is used. - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - use_cache (`bool`, *optional*): - If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding - (see `past_key_values`). 
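On the `is_sliding` flag set above: Gemma 2 alternates local (sliding-window) and global attention by layer parity, which `not bool(layer_idx % 2)` encodes. A quick check (illustrative only; the window value is an assumption):

```python
# Even-indexed layers (0, 2, 4, ...) attend within a sliding window; odd-indexed
# layers use full global attention.
sliding_window = 4096  # representative value, read from the config in the real model
for layer_idx in range(6):
    is_sliding = not bool(layer_idx % 2)
    window = sliding_window if is_sliding else None
    print(layer_idx, is_sliding, window)
# 0 True 4096
# 1 False None
# 2 True 4096
# ...
```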
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states - cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): - Indices depicting the position of the input sequence tokens in the sequence - kwargs (`dict`, *optional*): - Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code - into the model - """ if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding # Flash-attn is a 2D tensor if self.config._attn_implementation == "flash_attention_2": @@ -740,7 +661,10 @@ def __init__(self, config: Gemma2Config): [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() @@ -820,7 +744,7 @@ def forward( all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - for decoder_layer in self.layers: + for decoder_layer in self.layers[: self.config.num_hidden_layers]: if output_hidden_states: all_hidden_states += (hidden_states,) @@ -961,6 +885,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index dacaca1c7ef4a9..7236ae2f5c9f87 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -29,18 +29,17 @@ from ...utils import ( is_flash_attn_2_available, is_flash_attn_greater_or_equal, - is_flash_attn_greater_or_equal_2_10, + is_torch_greater_or_equal, logging, ) from ..gemma.modeling_gemma import ( - GemmaAttention, - GemmaDecoderLayer, GemmaForCausalLM, GemmaForSequenceClassification, GemmaForTokenClassification, GemmaModel, GemmaPreTrainedModel, GemmaRMSNorm, + GemmaRotaryEmbedding, apply_rotary_pos_emb, repeat_kv, ) @@ -49,6 +48,9 @@ if is_flash_attn_2_available(): from ...modeling_flash_attention_utils import _flash_attention_forward +if is_torch_greater_or_equal("2.5"): + from torch.nn.attention.flex_attention import flex_attention + _CHECKPOINT_FOR_DOC = "google/gemma2-7b" @@ -207,118 +209,217 @@ def forward(self, x): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) -class Gemma2Attention(GemmaAttention): +class Gemma2RotaryEmbedding(GemmaRotaryEmbedding): + pass + + +def eager_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + **_kwargs, +) -> Tuple[torch.Tensor, torch.Tensor]: + key_states = repeat_kv(key, config.num_key_value_groups) + value_states = repeat_kv(value, config.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling + + if config.attn_logit_softcapping is not None: + attn_weights = attn_weights / config.attn_logit_softcapping + attn_weights = torch.tanh(attn_weights) + attn_weights = attn_weights * config.attn_logit_softcapping + if mask is not None: # no matter the length, we just slice it + causal_mask = mask[:, :, :, : 
key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +def flash_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + target_dtype: torch.dtype = torch.float16, + **_kwargs, +) -> Tuple[torch.Tensor, None]: + if mask is not None: + seq_len = mask.shape[1] + query = query[:, :, :seq_len] + value = value[:, :, :seq_len] + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout + # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding + query_states = query.transpose(1, 2) + key_states = key.transpose(1, 2) + value_states = value.transpose(1, 2) + + dropout_rate = config.attention_dropout if config.training else 0.0 + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + mask, + seq_len, + dropout=dropout_rate, + softmax_scale=config.scaling, + is_causal=config.is_causal, + sliding_window=config.sliding_window, + use_top_left_mask=config._flash_attn_uses_top_left_mask, + softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None, + ) + + return attn_output, None + + +def flex_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + output_attentions: bool = False, + **_kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + def tanh_softcap(score, b, h, q_idx, kv_idx): + soft_cap = config.attn_logit_softcapping + score = soft_cap * torch.tanh(score / soft_cap) + if mask is not None: + return score + mask[b][0][q_idx][kv_idx] + return score + + attn_output = flex_attention( + query, + key, + value, + score_mod=tanh_softcap, + enable_gqa=True, + scale=config.scaling, + return_lse=output_attentions, + ) + if not output_attentions: + attn_weights = None + else: + attn_output, attn_weights = attn_output + + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +def sdpa_attention_forward( + config: Gemma2Config, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + mask: Optional[torch.Tensor], + **_kwargs, +) -> Tuple[torch.Tensor, None]: + key = repeat_kv(key, config.num_key_value_groups) + value = repeat_kv(value, config.num_key_value_groups) + + causal_mask = mask + if mask is not None: + causal_mask = causal_mask[:, :, :, : key.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
+ if query.device.type == "cuda" and causal_mask is not None: + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = True if causal_mask is None and query.shape[1] > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query, + key, + value, + attn_mask=causal_mask, + dropout_p=config.attention_dropout if config.training else 0.0, + is_causal=is_causal, + scale=config.scaling, + ) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, None + + +GEMMA2_ATTENTION_FUNCTION = { + "flash_attention_2": flash_attention_forward, + "flex_attention": flex_attention_forward, + "eager": eager_attention_forward, + "sdpa": sdpa_attention_forward, +} + + +class Gemma2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): - super().__init__(config, layer_idx) + super().__init__() + self.config = config + self.layer_idx = layer_idx + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True self.scaling = config.query_pre_attn_scalar**-0.5 self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - "sin": sin, - "cos": cos, - "sliding_window": self.sliding_window, - "cache_position": cache_position, - } - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling - - if self.config.attn_logit_softcapping is not None: - attn_weights = 
attn_weights / self.config.attn_logit_softcapping - attn_weights = torch.tanh(attn_weights) - attn_weights = attn_weights * self.config.attn_logit_softcapping - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights = attn_weights + causal_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + self.attn_logit_softcapping = config.attn_logit_softcapping + if self.hidden_size % self.num_heads != 0: raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." ) - attn_output = attn_output.transpose(1, 2).contiguous() - - attn_output = attn_output.view(bsz, q_len, -1) - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -class Gemma2FlashAttention2(Gemma2Attention): - """ - Gemma2 flash attention module. This module inherits from `Gemma2Attention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + self.rotary_emb = Gemma2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) def forward( self, hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_value: Optional[Cache] = None, output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - output_attentions = False - bsz, q_len, _ = hidden_states.size() query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -336,57 +437,14 @@ def forward( } key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - if attention_mask is not None: - seq_len = attention_mask.shape[1] - key_states = key_states[:, :, :seq_len] - value_states = value_states[:, :, :seq_len] - - # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache - # to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = self.attention_dropout if self.training else 0.0 - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in the correct dtype just to be sure everything works as expected. - # This might slowdown training & inference so it is recommended to not cast the LayerNorms - # in fp32. (Gemma2RMSNorm handles it correctly) - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." 
- ) - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) + if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]: + logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`") + attention_type = "flex_attention" + else: + attention_type = self.config._attn_implementation - attn_output = _flash_attention_forward( - query_states, - key_states, - value_states, - attention_mask, - q_len, - dropout=dropout_rate, - softmax_scale=self.scaling, - is_causal=self.is_causal, - sliding_window=self.sliding_window, - use_top_left_mask=self._flash_attn_uses_top_left_mask, - softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None, + attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type]( + self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions ) attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() @@ -398,105 +456,37 @@ def forward( return attn_output, attn_weights, past_key_value -class Gemma2SdpaAttention(Gemma2Attention): - """ - Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `Gemma2Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to - SDPA API. - """ - - # Adapted from Gemma2Attention.forward - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_value=past_key_value, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - cos, sin = self.rotary_emb(value_states, position_ids) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) - - if past_key_value is not None: - # sin and cos are specific to RoPE models; cache_position needed for the static cache - cache_kwargs = { - "sin": sin, - "cos": cos, - "sliding_window": self.sliding_window, - "cache_position": cache_position, - } - key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) - - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - causal_mask = attention_mask - if attention_mask is not None: - causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and causal_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. - is_causal = True if causal_mask is None and q_len > 1 else False - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=causal_mask, - dropout_p=self.attention_dropout if self.training else 0.0, - is_causal=is_causal, - scale=self.scaling, +class Gemma2FlashAttention2(Gemma2Attention): + def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + self.config._attn_implementation = "flash_attention_2" + logger.warning_once( + "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`" + "attribute of the `GemmaAttention` class! It will be removed in v4.48" ) - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, -1) - - attn_output = self.o_proj(attn_output) - return attn_output, None, past_key_value +class Gemma2SdpaAttention(Gemma2Attention): + def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None): + super().__init__(config, layer_idx) + self.config._attn_implementation = "sdpa" + logger.warning_once( + "The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`" + "attribute of the `GemmaAttention` class! 
It will be removed in v4.48" + ) -class Gemma2DecoderLayer(GemmaDecoderLayer): +class Gemma2DecoderLayer(nn.Module): def __init__(self, config: Gemma2Config, layer_idx: int): - super().__init__(config, layer_idx) + super().__init__() + self.hidden_size = config.hidden_size self.config = config self.is_sliding = not bool(layer_idx % 2) + self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx) self.mlp = Gemma2MLP(config) + self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.sliding_window = config.sliding_window @@ -653,7 +643,7 @@ def forward( all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None - for decoder_layer in self.layers: + for decoder_layer in self.layers[: self.config.num_hidden_layers]: if output_hidden_states: all_hidden_states += (hidden_states,) diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 85d32a7c691a18..de0e80e8c65ba9 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -45,6 +45,7 @@ class GlmConfig(PretrainedConfig): by meanpooling all the original heads within that group. For more details checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `num_attention_heads`. + partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. head_dim (`int`, *optional*, defaults to 128): The attention head dimension. 
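With the new `partial_rotary_factor`, the rotary embedding dimension becomes `int(head_dim * partial_rotary_factor)`, so only the leading slice of each attention head is rotated and the remainder passes through unchanged; the `apply_rotary_pos_emb` change further down in modeling_glm.py implements exactly this split. A minimal sketch with toy shapes (illustrative only, not the library code):

import torch

head_dim, partial_rotary_factor = 128, 0.5
rotary_dim = int(head_dim * partial_rotary_factor)  # 64 of the 128 channels get rotated

q = torch.randn(2, 8, 16, head_dim)  # (batch, num_heads, seq_len, head_dim)
q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
# cos/sin of width rotary_dim are applied to q_rot only, then the halves are rejoined:
# q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)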
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): @@ -93,6 +94,7 @@ def __init__( num_hidden_layers=40, num_attention_heads=32, num_key_value_heads=2, + partial_rotary_factor=0.5, head_dim=128, hidden_act="silu", attention_dropout=0.0, @@ -114,6 +116,7 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads + self.partial_rotary_factor = partial_rotary_factor self.head_dim = head_dim self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act diff --git a/src/transformers/models/glm/convert_glm_weights_to_hf.py b/src/transformers/models/glm/convert_glm_weights_to_hf.py index 3878ce0d25814a..1053f984d7f053 100644 --- a/src/transformers/models/glm/convert_glm_weights_to_hf.py +++ b/src/transformers/models/glm/convert_glm_weights_to_hf.py @@ -37,16 +37,28 @@ # fmt: on -def merge_safetensors(input_dir: str): - all_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] - all_files = sorted(all_files, key=lambda x: int(x.rsplit("-", 3)[1])) +def load_weights(input_dir: str): + safetensor_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".safetensors")] + bin_files = [os.path.join(input_dir, x) for x in os.listdir(input_dir) if x.endswith(".bin")] all_weights = {} - for file in all_files: - tensors = load_file(file) - all_weights.update(tensors) - return all_weights + if safetensor_files: + safetensor_files = sorted(safetensor_files, key=lambda x: int(x.rsplit("-", 3)[1])) + for file in safetensor_files: + tensors = load_file(file) + all_weights.update(tensors) + return all_weights + + elif bin_files: + bin_files = sorted(bin_files, key=lambda x: int(x.rsplit("-", 3)[1])) + for file in bin_files: + tensors = torch.load(file, map_location="cpu") + all_weights.update(tensors) + return all_weights + + else: + raise ValueError("No .safetensors or .bin files found in the specified directory.") def map_old_key_to_new(old_key): @@ -100,7 +112,8 @@ def convert_config(original_config: dict): "attention_bias": "add_qkv_bias", } similar_keys_to_keep = [ - "num_attention_heads" "hidden_size", + "num_attention_heads", + "hidden_size", "attention_dropout", "use_cache", "eos_token_id", @@ -120,24 +133,27 @@ def convert_config(original_config: dict): return new_config -def convert_glm_tokenizer(input_dir): +def convert_glm_tokenizer(input_dir, use_post_processor=False): fast_tok = PreTrainedTokenizerFast.from_pretrained(input_dir, model_input_names=["input_ids", "attention_mask"]) - # Add the two tokens automatically with post processor - fast_tok._tokenizer.post_processor = processors.Sequence( - [ - processors.ByteLevel(trim_offsets=False), - processors.TemplateProcessing( - single="[gMASK]:0 :0 $A:0", - pair="[gMASK]:0 :0 $A:0 $B:1", - special_tokens=[("[gMASK]", 151331), ("", 151333)], - ), - ], - ) - + if use_post_processor: + fast_tok._tokenizer.post_processor = processors.Sequence( + [ + processors.ByteLevel(trim_offsets=False), + processors.TemplateProcessing( + single="[gMASK]:0 :0 $A:0", + pair="[gMASK]:0 :0 $A:0 $B:1", + special_tokens=[("[gMASK]", 151331), ("", 151333)], + ), + ], + ) + else: + fast_tok._tokenizer.post_processor = processors.Sequence( + [processors.ByteLevel(trim_offsets=False)], + ) return fast_tok -def convert_glm_model(input_dir, output_dir): +def convert_glm_model(input_dir, output_dir, use_post_processor=False): # Load and convert config with open(os.path.join(input_dir, 
"config.json")) as f: original_config = json.load(f) @@ -145,7 +161,7 @@ def convert_glm_model(input_dir, output_dir): config.save_pretrained(output_dir) # Load and convert weights - original_state_dict = merge_safetensors(input_dir) + original_state_dict = load_weights(input_dir) new_dict = convert_state_dict(original_state_dict, config) with torch.device("meta"): model = GlmForCausalLM(config) @@ -153,7 +169,7 @@ def convert_glm_model(input_dir, output_dir): model.save_pretrained(output_dir) # Load and convert tokenizer - tokenizer = convert_glm_tokenizer(input_dir) + tokenizer = convert_glm_tokenizer(input_dir, use_post_processor) tokenizer.save_pretrained(output_dir) @@ -169,6 +185,11 @@ def convert_glm_model(input_dir, output_dir): type=str, help="Location to write HF model and tokenizer", ) + parser.add_argument( + "--use_post_processor", + action="store_true", + help="Whether to apply post processor with special tokens", + ) args = parser.parse_args() - convert_glm_model(args.input_dir, args.output_dir) + convert_glm_model(args.input_dir, args.output_dir, args.use_post_processor) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 248ec4021791b1..16a724f69464a9 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -169,13 +169,14 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1) sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1) - # Keep half for later concatenation - q, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :] - k, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :] + # Keep half or full tensor for later concatenation + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] - # Apply rotary embeddings on the first half - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) + # Apply rotary embeddings on the first half or full tensor + q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin) + k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin) # Concatenate back to full shape q_embed = torch.cat([q_embed, q_pass], dim=-1) @@ -705,9 +706,13 @@ def __init__(self, config: GlmConfig): ) self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.rotary_emb = GlmRotaryEmbedding( - dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta + dim=int(config.head_dim * config.partial_rotary_factor), + max_position_embeddings=config.max_position_embeddings, + base=config.rope_theta, ) self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() @@ -787,7 +792,7 @@ def forward( all_self_attns = () if output_attentions else None next_decoder_cache = None - for decoder_layer in self.layers: + for decoder_layer in self.layers[: self.config.num_hidden_layers]: if output_hidden_states: all_hidden_states += (hidden_states,) @@ -967,6 +972,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} def __init__(self, config: GlmConfig): 
super().__init__(config) diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 39ee4a2ad5803e..48605c15d30be3 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -95,13 +95,14 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1) sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1) - # Keep half for later concatenation - q, q_pass = q[..., : q.shape[-1] // 2], q[..., q.shape[-1] // 2 :] - k, k_pass = k[..., : k.shape[-1] // 2], k[..., k.shape[-1] // 2 :] + # Keep half or full tensor for later concatenation + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] - # Apply rotary embeddings on the first half - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) + # Apply rotary embeddings on the first half or full tensor + q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin) + k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin) # Concatenate back to full shape q_embed = torch.cat([q_embed, q_pass], dim=-1) @@ -152,7 +153,9 @@ def __init__(self, config: GlmConfig): ) self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.rotary_emb = GlmRotaryEmbedding( - dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta + dim=int(config.head_dim * config.partial_rotary_factor), + max_position_embeddings=config.max_position_embeddings, + base=config.rope_theta, ) self.gradient_checkpointing = False diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 3e0c52e89a93d5..58143192c20482 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -1101,7 +1101,8 @@ def forward( all_self_attentions = () if output_attentions else None all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None all_hidden_states = () if output_hidden_states else None - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + for i in range(len(self.h)): + block, layer_past = self.h[i], past_key_values[i] # Model parallel if self.model_parallel: torch.cuda.set_device(hidden_states.device) diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 359996983eed74..3fdb814ebab51a 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -18,7 +18,6 @@ import torch import torch.utils.checkpoint -from packaging import version from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss @@ -42,9 +41,9 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...utils import ( - get_torch_version, is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, + is_torch_flex_attn_available, logging, ) from .configuration_gpt_neox import GPTNeoXConfig @@ -53,6 +52,9 @@ if is_flash_attn_2_available(): from ...modeling_flash_attention_utils import _flash_attention_forward +if is_torch_flex_attn_available(): + from torch.nn.attention.flex_attention import flex_attention + logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = 
"trl-internal-testing/tiny-random-GPTNeoXForCausalLM" @@ -76,6 +78,7 @@ class GPTNeoXPreTrainedModel(PreTrainedModel): _supports_quantized_cache = True _supports_static_cache = True _supports_sdpa = True + _supports_flex_attn = True def _init_weights(self, module): """Initialize the weights""" @@ -92,6 +95,169 @@ def _init_weights(self, module): module.weight.data.fill_(1.0) +def eager_attention_forward( + query, key, value, attention_mask, head_mask, norm_factor, attention_dropout, training, **_kwargs +): + # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] + batch_size, num_attention_heads, query_length, attn_head_size = query.size() + key_length = key.size(-2) + + query = query.view(batch_size * num_attention_heads, query_length, attn_head_size) + key = key.view(batch_size * num_attention_heads, key_length, attn_head_size) + attn_scores = torch.zeros( + batch_size * num_attention_heads, + query_length, + key_length, + dtype=query.dtype, + device=key.device, + ) + attn_scores = torch.baddbmm( + attn_scores, + query, + key.transpose(1, 2), + beta=1.0, + alpha=norm_factor, + ) + attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_scores = attn_scores + causal_mask + + attn_weights = nn.functional.softmax(attn_scores, dim=-1) + attn_weights = attn_weights.to(value.dtype) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_weights = nn.functional.dropout(attn_weights, p=attention_dropout, training=training) + attn_output = torch.matmul(attn_weights, value) + + # Reshape outputs + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +def flash_attention_forward( + query, + key, + value, + attention_mask, + norm_factor, + attention_dropout, + training, + target_dtype=None, + **_kwargs, +): + query_length = query.shape[-2] + + # GPT-neo-X casts query and key in fp32 to apply rotary embedding in full precision + query = query.to(value.dtype) + key = key.to(value.dtype) + + # Permute to get the expected shape for Flash Attention + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + attention_dropout = attention_dropout if training else 0.0 + flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + # Compute attention + attn_output = _flash_attention_forward( + query, + key, + value, + attention_mask, + query_length, + dropout=attention_dropout, + softmax_scale=norm_factor, + is_causal=True, + use_top_left_mask=flash_attn_uses_top_left_mask, + target_dtype=target_dtype, + ) + + return attn_output, None + + +def sdpa_attention_forward(query, key, value, attention_mask, attention_dropout, training, **_kwargs): + q_len = query.shape[-2] + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key.shape[-2]] + + # GPT-neo-X casts query and key in fp32 to apply rotary embedding in full precision + query = query.to(value.dtype) + key = key.to(value.dtype) + + # Avoid torch==2.1.2 specific bug for the memory-efficient backend in SDPA + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic 
shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query=query, + key=key, + value=value, + attn_mask=causal_mask, + dropout_p=attention_dropout if training else 0.0, + is_causal=is_causal, + ) + + # Reshape outputs + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, None + + +def flex_attention_forward(query, key, value, attention_mask, head_mask, norm_factor, **_kwargs): + causal_mask = attention_mask + if causal_mask is not None: + causal_mask = causal_mask[:, :, :, : key.shape[-2]] + + def causal_mod(score, b, h, q_idx, kv_idx): + if causal_mask is not None: + score += causal_mask[b][0][q_idx][kv_idx] + if head_mask is not None: + score += head_mask[b][h][0][0] + return score + + attn_output, attn_weights = flex_attention( + query, + key, + value, + score_mod=causal_mod, + enable_gqa=True, + scale=norm_factor, + # Last time checked on PyTorch == 2.5.1: Flex Attention always computes the lse regardless. + # For simplification, we thus always return it as no additional computations are introduced. + return_lse=True, + ) + + # lse is returned in float32 + attn_weights = attn_weights.to(value.dtype) + + # Reshape outputs + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +GPTNEOX_ATTENTION_FUNCTION = { + "eager": eager_attention_forward, + "flash_attention_2": flash_attention_forward, + "sdpa": sdpa_attention_forward, + "flex_attention": flex_attention_forward, +} + + class GPTNeoXAttention(nn.Module): def __init__(self, config, layer_idx=None): super().__init__() @@ -147,20 +313,57 @@ def forward( cache_position: Optional[torch.LongTensor] = None, position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 ): + bsz, seq_len, _ = hidden_states.shape + # Apply attention-specific projections and rope query, key, value, present = self._attn_projections_and_rope( hidden_states=hidden_states, position_ids=position_ids, layer_past=layer_past, use_cache=use_cache, + cache_position=cache_position, position_embeddings=position_embeddings, ) + # Checking for fallbacks in case an unsupported feature is requested + attention_type = self.config._attn_implementation + if (output_attentions or head_mask is not None) and self.config._attn_implementation in [ + "sdpa", + "flash_attention_2", + ]: + logger.warning_once( + f"Setting `attention_type` to `eager` because `{attention_type}` does not support" + f" `output_attentions=True` or `head_mask`." + ) + attention_type = "eager" + + elif ( + self.training + and self.config.attention_dropout > 0 + and self.config._attn_implementation == "flex_attention" + ): + logger.warning_once( + f"Setting `attention_type` to `eager` because `dropout` is not supported in `{attention_type}`." 
+ ) + attention_type = "eager" + # Compute attention - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + attn_output, attn_weights = GPTNEOX_ATTENTION_FUNCTION[attention_type]( + query, + key, + value, + attention_mask=attention_mask, + head_mask=head_mask, + norm_factor=self.norm_factor, + attention_dropout=self.config.attention_dropout, + training=self.training, + # Flash Attention 2 specific PEFT check + target_dtype=self._fa_peft_dtype_check(value), + ) - # Reshape outputs - attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_size) + # Reshape outputs and final projection + attn_output = attn_output.contiguous() + attn_output = attn_output.view(bsz, seq_len, -1) attn_output = self.dense(attn_output) outputs = (attn_output, present) @@ -250,262 +453,47 @@ def _attn_projections_and_rope( return query, key, value, layer_past - def _attn(self, query, key, value, attention_mask=None, head_mask=None): - # q, k, v: [bs, num_attention_heads, seq_len, attn_head_size] - # compute causal mask from causal mask buffer - batch_size, num_attention_heads, query_length, attn_head_size = query.size() - key_length = key.size(-2) - - # dynamically increase the causal mask with the key length, if needed. - if key_length > self.bias.shape[-1]: - self._init_bias(key_length, device=key.device) - causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] - - query = query.view(batch_size * num_attention_heads, query_length, attn_head_size) - key = key.view(batch_size * num_attention_heads, key_length, attn_head_size) - attn_scores = torch.zeros( - batch_size * num_attention_heads, - query_length, - key_length, - dtype=query.dtype, - device=key.device, - ) - attn_scores = torch.baddbmm( - attn_scores, - query, - key.transpose(1, 2), - beta=1.0, - alpha=self.norm_factor, - ) - attn_scores = attn_scores.view(batch_size, num_attention_heads, query_length, key_length) - - mask_value = torch.finfo(attn_scores.dtype).min - # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. - # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` - mask_value = torch.tensor(mask_value, dtype=attn_scores.dtype).to(attn_scores.device) - attn_scores = torch.where(causal_mask, attn_scores, mask_value) - - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, : key.shape[-2]] - attn_scores = attn_scores + causal_mask - - attn_weights = nn.functional.softmax(attn_scores, dim=-1) - attn_weights = attn_weights.to(value.dtype) - - # Mask heads if we want to - if head_mask is not None: - attn_weights = attn_weights * head_mask - - attn_weights = self.attention_dropout(attn_weights) - - attn_output = torch.matmul(attn_weights, value) - return attn_output, attn_weights + def _fa_peft_dtype_check(self, value): + """ + PEFT can silently cast the dtype to float32 - this method returns the target dtype to which + FA should convert back to (if necessary). For now, we can not move this to the forward pass + itself due to the dependency on checking on some part of its own weights (last case). 
+ """ + target_dtype = None + if self.config._attn_implementation == "flash_attention_2": + input_dtype = value.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.query_key_value.weight.dtype + return target_dtype +# TODO Remove in deprecation cycle class GPTNeoXFlashAttention2(GPTNeoXAttention): - """ - GPTNeoX flash attention module. This module inherits from `GPTNeoXAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() - - def forward( - self, - hidden_states: torch.FloatTensor, - attention_mask: torch.FloatTensor, - position_ids: torch.LongTensor, - head_mask: Optional[torch.FloatTensor] = None, - layer_past: Optional[Cache] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 - ): - # Apply attention-specific projections and rope - query, key, value, present = self._attn_projections_and_rope( - hidden_states=hidden_states, - position_ids=position_ids, - layer_past=layer_past, - use_cache=use_cache, - cache_position=cache_position, - position_embeddings=position_embeddings, - ) - - query_length = query.shape[-2] - - # GPT-neo-X casts query and key in fp32 to apply rotary embedding in full precision - target_dtype = value.dtype - if query.dtype != target_dtype: - query = query.to(target_dtype) - if key.dtype != target_dtype: - key = key.to(target_dtype) - - # Permute to get the expected shape for Flash Attention - query = query.permute(0, 2, 1, 3) - key = key.permute(0, 2, 1, 3) - value = value.permute(0, 2, 1, 3) - - # In PEFT, usually we cast the layer norms in float32 for training stability reasons - # therefore the input hidden states gets silently casted in float32. Hence, we need - # cast them back in float16 / bfloat16 just to be sure everything works as expected. 
- # This might slowdown training & inference so it is recommended to not cast the LayerNorms - input_dtype = query.dtype - if input_dtype == torch.float32: - if torch.is_autocast_enabled(): - target_dtype = torch.get_autocast_gpu_dtype() - # Handle the case where the model is quantized - elif hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.query_key_value.weight.dtype - - logger.warning_once( - f"The input hidden states seems to be silently casted in float32, this might be related to" - f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" - f" {target_dtype}." - ) - - query = query.to(target_dtype) - key = key.to(target_dtype) - value = value.to(target_dtype) - - attention_dropout = self.config.attention_dropout if self.training else 0.0 - - # Compute attention - attn_weights = _flash_attention_forward( - query, - key, - value, - attention_mask, - query_length, - dropout=attention_dropout, - softmax_scale=self.norm_factor, - is_causal=self.is_causal, - use_top_left_mask=self._flash_attn_uses_top_left_mask, - ) - - # Reshape outputs - attn_output = attn_weights.reshape( - attn_weights.shape[0], attn_weights.shape[1], self.num_attention_heads * self.head_size + logger.warning_once( + "The `GPTNeoXFlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`" + "attribute of the `GPTNeoXAttention` class! It will be removed in v4.48" ) - attn_output = self.dense(attn_output) - - outputs = (attn_output, layer_past) - if output_attentions: - outputs += (attn_weights,) - - return outputs +# TODO Remove in deprecation cycle class GPTNeoXSdpaAttention(GPTNeoXAttention): - """ - GPTNeoX attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from - `GPTNeoXAttention` as the weights of the module stays untouched. The only changes are on the forward pass - to adapt to the SDPA API. - """ - def __init__(self, config, layer_idx=None): super().__init__(config, layer_idx=layer_idx) - # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom - # attn_mask, so we need to call `.contiguous()`. This was fixed in torch==2.2.0. - # Reference: https://github.com/pytorch/pytorch/issues/112577 - self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") - - def forward( - self, - hidden_states: torch.FloatTensor, - attention_mask: torch.FloatTensor, - position_ids: torch.LongTensor, - head_mask: Optional[torch.FloatTensor] = None, - layer_past: Optional[Tuple[torch.Tensor]] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 - ): - if output_attentions or head_mask is not None: - logger.warning_once( - "`GPTNeoXSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " - "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but " - "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " - 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - head_mask=head_mask, - layer_past=layer_past, - use_cache=use_cache, - output_attentions=output_attentions, - cache_position=cache_position, - ) - - bsz, q_len, _ = hidden_states.size() - - # Apply attention-specific projections and rope - query, key, value, present = self._attn_projections_and_rope( - hidden_states=hidden_states, - position_ids=position_ids, - layer_past=layer_past, - use_cache=use_cache, - cache_position=cache_position, - position_embeddings=position_embeddings, - ) - - causal_mask = attention_mask - if attention_mask is not None: - causal_mask = causal_mask[:, :, :, : key.shape[-2]] - - # GPT-neo-X casts query and key in fp32 to apply rotary embedding in full precision - target_dtype = value.dtype - if query.dtype != target_dtype: - query = query.to(target_dtype) - if key.dtype != target_dtype: - key = key.to(target_dtype) - - # Avoid torch==2.1.2 specific bug for the memory-efficient backend in SDPA - if self.require_contiguous_qkv and query.device.type == "cuda" and attention_mask is not None: - query = query.contiguous() - key = key.contiguous() - value = value.contiguous() - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. - is_causal = True if causal_mask is None and q_len > 1 else False - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query=query, - key=key, - value=value, - attn_mask=causal_mask, - dropout_p=self.attention_dropout.p if self.training else 0.0, - is_causal=is_causal, + logger.warning_once( + "The `GPTNeoXSdpaAttention` class is deprecated in favor of simply modifying the `config._attn_implementation`" + "attribute of the `GPTNeoXAttention` class! 
It will be removed in v4.48" ) - # Reshape outputs - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.hidden_size) - - attn_output = self.dense(attn_output) - - return attn_output, present, None - - -def attention_mask_func(attention_scores, ltor_mask): - attention_scores.masked_fill_(~ltor_mask, torch.finfo(attention_scores.dtype).min) - return attention_scores - # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->GPTNeoX class GPTNeoXRotaryEmbedding(nn.Module): @@ -675,6 +663,7 @@ def forward(self, hidden_states): "eager": GPTNeoXAttention, "flash_attention_2": GPTNeoXFlashAttention2, "sdpa": GPTNeoXSdpaAttention, + "flex_attention": GPTNeoXAttention, } @@ -919,7 +908,13 @@ def forward( # attention_probs has shape bsz x n_heads x N x N # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + converted_head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + # Flex Attention converts it to a separate mask + if head_mask is not None: + converted_head_mask = ~converted_head_mask.bool() * torch.finfo(inputs_embeds.dtype).min + converted_head_mask = converted_head_mask.to(dtype=self.dtype, device=self.device) + head_mask = converted_head_mask + hidden_states = self.emb_dropout(inputs_embeds) # create position embeddings to be shared across the decoder layers diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index 50c5b538af306c..9cabd48a51021f 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -909,7 +909,7 @@ def _update_causal_mask( min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index 07b42822621a3e..4871fc3584faee 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -1142,7 +1142,7 @@ def _update_causal_mask( min_dtype = torch.finfo(dtype).min sequence_length = input_tensor.shape[1] if using_static_cache: - target_length = past_key_values.get_max_length() + target_length = past_key_values.get_max_cache_shape() else: target_length = ( attention_mask.shape[-1] diff --git a/src/transformers/models/idefics3/__init__.py b/src/transformers/models/idefics3/__init__.py index 35b1df5c678439..cec07ca6f5e2d3 100644 --- a/src/transformers/models/idefics3/__init__.py +++ b/src/transformers/models/idefics3/__init__.py @@ -16,7 +16,7 @@ from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available -_import_structure = {"configuration_idefics3": ["Idefics3Config"]} +_import_structure = {"configuration_idefics3": ["Idefics3Config", "Idefics3VisionConfig"]} try: @@ -38,11 +38,12 @@ "Idefics3ForConditionalGeneration", "Idefics3PreTrainedModel", "Idefics3Model", + "Idefics3VisionTransformer", ] _import_structure["processing_idefics3"] = ["Idefics3Processor"] if TYPE_CHECKING: - from .configuration_idefics3 
import Idefics3Config + from .configuration_idefics3 import Idefics3Config, Idefics3VisionConfig try: if not is_vision_available(): @@ -62,6 +63,7 @@ Idefics3ForConditionalGeneration, Idefics3Model, Idefics3PreTrainedModel, + Idefics3VisionTransformer, ) from .processing_idefics3 import Idefics3Processor diff --git a/src/transformers/models/idefics3/image_processing_idefics3.py b/src/transformers/models/idefics3/image_processing_idefics3.py index 05a1a396dc72d3..f9161416656cd7 100644 --- a/src/transformers/models/idefics3/image_processing_idefics3.py +++ b/src/transformers/models/idefics3/image_processing_idefics3.py @@ -38,6 +38,7 @@ logger = logging.get_logger(__name__) +MAX_IMAGE_SIZE = 4096 # 4k resolution as absolute maximum if is_vision_available(): @@ -116,7 +117,6 @@ def _resize_output_size_scale_below_upper_bound( def get_resize_output_image_size( image, resolution_max_side: int, - max_image_size: int = 1820, input_data_format: Optional[Union[str, ChannelDimension]] = None, ) -> Tuple[int, int]: """ @@ -126,24 +126,18 @@ def get_resize_output_image_size( Image to resize. resolution_max_side (`int`): The longest edge of the image will be resized to this value. The shortest edge will be resized to keep the - input aspect ratio, with a lower bound of `min_image_size`. - max_image_size (`int`, *optional*, defaults to 1820): - Maximum image resolution. If the image is larger than this size, the longest edge will be resized to this - value, with the shortest edge resized to keep the input aspect ratio, with a lower bound of `min_image_size`. + input aspect ratio. input_data_format (`ChannelDimension` or `str`): The channel dimension format of the input image. Returns: The output size of the image after resizing. """ - if resolution_max_side > max_image_size: - raise ValueError("`resolution_max_side` cannot be larger than `max_image_size`") - height, width = get_image_size(image, channel_dim=input_data_format) # Find the output size, when rescaling the longest edge to max_len and preserving the aspect ratio height, width = _resize_output_size_rescale_to_max_len(height, width, max_len=resolution_max_side) - # Find the output size when scaling the image to be below the max_image_size - height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=max_image_size) + # Find the output size when scaling the image to be below the MAX_IMAGE_SIZE + height, width = _resize_output_size_scale_below_upper_bound(height, width, max_len=MAX_IMAGE_SIZE) return height, width @@ -251,7 +245,7 @@ def convert_to_rgb( data_format = input_data_format if data_format is None else data_format mode = "P" if palette is not None else None - image = to_pil_image(image, image_mode=mode) + image = to_pil_image(image, image_mode=mode, input_data_format=input_data_format) if image.mode == "P" and palette is not None: image.putpalette(palette) @@ -404,7 +398,7 @@ def resize( image_mode = None if image.ndim == 2 or image.shape[-1] == 1: image_mode = "P" - image = to_pil_image(image, image_mode=image_mode) + image = to_pil_image(image, image_mode=image_mode, input_data_format=input_data_format) resized_image = image.resize((size[1], size[0]), resample=resample) resized_image = np.array(resized_image) @@ -754,6 +748,16 @@ def preprocess( # All transformations expect numpy arrays. 
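With the image-processing change above, the per-call `max_image_size` argument is gone and the module-level `MAX_IMAGE_SIZE = 4096` acts as an absolute cap applied after the longest edge has been rescaled to `resolution_max_side`. Roughly, the two-step computation looks like this sketch (simplified; the real helpers also enforce a minimum size and slightly different rounding):

MAX_IMAGE_SIZE = 4096  # 4k absolute maximum, matching the new module constant

def scale_to_longest_edge(height, width, max_len):
    # rescale so the longest edge equals max_len while preserving the aspect ratio
    scale = max_len / max(height, width)
    return max(int(round(height * scale)), 1), max(int(round(width * scale)), 1)

h, w = scale_to_longest_edge(3000, 1200, 1820)   # longest edge -> resolution_max_side
if max(h, w) > MAX_IMAGE_SIZE:
    h, w = scale_to_longest_edge(h, w, MAX_IMAGE_SIZE)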
images_list = [[to_numpy_array(image) for image in images] for images in images_list] + # Extra channel dimension for grayscale images + if input_data_format in [ChannelDimension.LAST, None]: + images_list = [ + [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list + ] + elif input_data_format == ChannelDimension.FIRST: + images_list = [ + [np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list + ] + if is_scaled_image(images_list[0][0]) and do_rescale: logger.warning_once( "It looks like you are trying to rescale already rescaled images. If the input" @@ -764,18 +768,6 @@ def preprocess( if input_data_format is None: input_data_format = infer_channel_dimension_format(images_list[0][0], num_channels=(1, 3, 4)) - # Extra channel dimension for grayscale images - if input_data_format == ChannelDimension.LAST: - images_list = [ - [np.expand_dims(img, axis=-1) if img.ndim == 2 else img for img in images] for images in images_list - ] - elif input_data_format == ChannelDimension.FIRST: - images_list = [ - [np.expand_dims(img, axis=0) if img.ndim == 2 else img for img in images] for images in images_list - ] - else: - raise ValueError(f"Invalid channel dimension format {input_data_format}.") - if do_resize: images_list = [ [ diff --git a/src/transformers/models/ijepa/__init__.py b/src/transformers/models/ijepa/__init__.py new file mode 100644 index 00000000000000..efc8c90b17628d --- /dev/null +++ b/src/transformers/models/ijepa/__init__.py @@ -0,0 +1,55 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = {"configuration_ijepa": ["IJepaConfig"]} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_ijepa"] = [ + "IJepaForImageClassification", + "IJepaModel", + "IJepaPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_ijepa import IJepaConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_ijepa import ( + IJepaForImageClassification, + IJepaModel, + IJepaPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/ijepa/configuration_ijepa.py b/src/transformers/models/ijepa/configuration_ijepa.py new file mode 100644 index 00000000000000..26378e6e81d9ce --- /dev/null +++ b/src/transformers/models/ijepa/configuration_ijepa.py @@ -0,0 +1,108 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""I-JEPA model configuration""" + +from ...configuration_utils import PretrainedConfig + + +class IJepaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`IJepaModel`]. It is used to instantiate an IJEPA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the I-JEPA + [google/ijepa-base-patch16-224](https://huggingface.co/google/ijepa-base-patch16-224) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. 
+ + Example: + + ```python + >>> from transformers import IJepaConfig, IJepaModel + + >>> # Initializing a IJEPA ijepa-base-patch16-224 style configuration + >>> configuration = IJepaConfig() + + >>> # Initializing a model (with random weights) from the ijepa-base-patch16-224 style configuration + >>> model = IJepaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "ijepa" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias diff --git a/src/transformers/models/ijepa/convert_ijepa_to_hf.py b/src/transformers/models/ijepa/convert_ijepa_to_hf.py new file mode 100644 index 00000000000000..5c15a72ff88847 --- /dev/null +++ b/src/transformers/models/ijepa/convert_ijepa_to_hf.py @@ -0,0 +1,267 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert IJEPA checkpoints from the original repository. 
+ +URL: https://github.com/facebookresearch/ijepa +""" + +import argparse +import gc +import re +from pathlib import Path + +import requests +import torch +from PIL import Image + +from transformers import ( + IJepaConfig, + IJepaModel, + ViTImageProcessor, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +# fmt: off +ORIGINAL_TO_CONVERTED_KEY_MAPPING = { + # Projection layer + position embeddings + r"pos_embed": r"embeddings.position_embeddings", + r"patch_embed.proj.weight": r"embeddings.patch_embeddings.projection.weight", + r"patch_embed.proj.bias": r"embeddings.patch_embeddings.projection.bias", + + # Encoder layers: Layernorms, Attention, Feedforward layers + r"blocks.(\d+).norm1.weight": r"encoder.layer.\1.layernorm_before.weight", + r"blocks.(\d+).norm1.bias": r"encoder.layer.\1.layernorm_before.bias", + r"blocks.(\d+).attn.proj.weight": r"encoder.layer.\1.attention.output.dense.weight", + r"blocks.(\d+).attn.proj.bias": r"encoder.layer.\1.attention.output.dense.bias", + r"blocks.(\d+).norm2.weight": r"encoder.layer.\1.layernorm_after.weight", + r"blocks.(\d+).norm2.bias": r"encoder.layer.\1.layernorm_after.bias", + r"blocks.(\d+).mlp.fc1.weight": r"encoder.layer.\1.intermediate.dense.weight", + r"blocks.(\d+).mlp.fc1.bias": r"encoder.layer.\1.intermediate.dense.bias", + r"blocks.(\d+).mlp.fc2.weight": r"encoder.layer.\1.output.dense.weight", + r"blocks.(\d+).mlp.fc2.bias": r"encoder.layer.\1.output.dense.bias", + + # Layernorm + pooler + r"norm.weight": r"layernorm.weight", + r"norm.bias": r"layernorm.bias", +} +# fmt: on + + +def convert_old_keys_to_new_keys(state_dict_keys: dict = None): + """ + Converts old keys to new keys using the mapping and dynamically removes the 'ijepa.' prefix if necessary. + + Args: + state_dict_keys (dict): The keys from the state_dict to convert. + + Returns: + dict: A mapping from old keys to new keys. 
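To make the regex key mapping concrete: every original checkpoint key is run through each pattern/replacement pair, so an encoder block key lands on the corresponding HF module path. A small self-contained illustration using a subset of the same patterns (the real converter also strips a possible `module.` prefix first):

import re

mapping = {
    r"blocks.(\d+).attn.proj.weight": r"encoder.layer.\1.attention.output.dense.weight",
    r"blocks.(\d+).mlp.fc1.weight": r"encoder.layer.\1.intermediate.dense.weight",
    r"pos_embed": r"embeddings.position_embeddings",
}

def remap(key):
    for pattern, replacement in mapping.items():
        key = re.sub(pattern, replacement, key)
    return key

print(remap("blocks.0.attn.proj.weight"))  # encoder.layer.0.attention.output.dense.weight
print(remap("pos_embed"))                  # embeddings.position_embeddings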
+ """ + output_dict = {} + if state_dict_keys is not None: + old_text = "\n".join(state_dict_keys) + new_text = old_text + + # Apply regex-based mapping + for pattern, replacement in ORIGINAL_TO_CONVERTED_KEY_MAPPING.items(): + if replacement is None: + new_text = re.sub(pattern, "", new_text) # Skip the key + continue + new_text = re.sub(pattern, replacement, new_text) + + output_dict = dict(zip(old_text.split("\n"), new_text.split("\n"))) + + return output_dict + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config): + for i in range(config.num_hidden_layers): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"blocks.{i}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"blocks.{i}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[: config.hidden_size, :] + state_dict[f"encoder.layer.{i}.attention.attention.query.bias"] = in_proj_bias[: config.hidden_size] + state_dict[f"encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"encoder.layer.{i}.attention.attention.key.bias"] = in_proj_bias[ + config.hidden_size : config.hidden_size * 2 + ] + state_dict[f"encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-config.hidden_size :, :] + state_dict[f"encoder.layer.{i}.attention.attention.value.bias"] = in_proj_bias[-config.hidden_size :] + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +def get_ijepa_config(model_name): + patch_size = int(model_name.split("_")[1][4:]) + config = IJepaConfig(patch_size=patch_size) + if "vith" in model_name: + config.hidden_size = 1280 + config.num_hidden_layers = 32 + config.num_attention_heads = 16 + config.layer_norm_eps = 1e-6 + config.mlp_ratio = 4 + config.intermediate_size = 5120 + if model_name == "ijepa_vith16_1k": + config.image_size = 448 + elif "vitg" in model_name: + config.hidden_size = 1408 + config.num_hidden_layers = 40 + config.num_attention_heads = 16 + config.layer_norm_eps = 1e-6 + config.mlp_ratio = 48 / 11 + config.intermediate_size = 6144 + else: + raise ValueError("Model not supported, only supports huge and giant models.") + return config + + +@torch.no_grad() +def write_model(model_name, output_dir, safe_serialization, push_to_hub, verify_logits): + """ + Copy/paste/tweak model's weights to our IJEPA structure. 
+ """ + + # define default IJEPA configuration + config = get_ijepa_config(model_name) + + checkpoint_mapping = { + "ijepa_vith14_1k": "https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.14-300e.pth.tar", + "ijepa_vith14_22k": "https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.h.14-900e.pth.tar", + "ijepa_vith16_1k": "https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.16-448px-300e.pth.tar", + "ijepa_vitg16_22k": "https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.g.16-600e.pth.tar", + } + + # Load original checkpoint + checkpoint_url = checkpoint_mapping[model_name] + original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["encoder"] + original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()} + + # Rename keys + state_dict = original_state_dict.copy() + new_keys = convert_old_keys_to_new_keys(state_dict.keys()) + for old_key, new_key in new_keys.items(): + rename_key(state_dict, old_key, new_key) + read_in_q_k_v(state_dict, config) + + # load HuggingFace model + model = IJepaModel(config, add_pooling_layer=False).eval() + model.load_state_dict(state_dict) + size = {"height": config.image_size, "width": config.image_size} + image_processor = ViTImageProcessor(size=size) + + if verify_logits: + # Check outputs on an image, prepared by ViTImageProcessor + encoding = image_processor(images=prepare_img(), return_tensors="pt") + pixel_values = encoding["pixel_values"] + with torch.no_grad(): + outputs = model(pixel_values) + + expected_slices = { + "ijepa_vith14_1k": torch.Tensor( + [[-0.0621, -0.0054, -2.7513], [-0.1952, 0.0909, -3.9536], [0.0942, -0.0331, -1.2833]] + ), + "ijepa_vith14_22k": torch.Tensor( + [[0.0358, -0.0045, -0.2154], [0.0418, -0.0246, 0.0108], [0.2529, -0.0345, -0.0246]] + ), + "ijepa_vith16_1k": torch.Tensor( + [[0.5145, -0.1259, 0.0615], [0.1132, 0.0028, -0.0496], [1.1586, -0.0056, -0.0387]] + ), + "ijepa_vitg16_22k": torch.Tensor( + [[0.0512, -0.0510, -0.0649], [0.1972, 0.0380, -0.0790], [0.1667, -0.0834, -0.1240]] + ), + } + + assert torch.allclose( + expected_slices[model_name], + outputs.last_hidden_state[0, :3, :3], + atol=1e-4, + ) + + if output_dir: + Path(output_dir).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {output_dir}") + image_processor.save_pretrained(output_dir, safe_serialization=safe_serialization) + model.save_pretrained(output_dir, safe_serialization=safe_serialization) + + if push_to_hub: + image_processor.push_to_hub(repo_id=f"jmtzt/{model_name}", safe_serialization=safe_serialization) + model.push_to_hub(repo_id=f"jmtzt/{model_name}", safe_serialization=safe_serialization) + + if output_dir: + del model, state_dict + gc.collect() + print("Reloading the model to check if it's saved correctly.") + IJepaModel.from_pretrained(output_dir, device_map="auto") + print("Model reloaded successfully.") + + +def main(): + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="ijepa_vith14_1k", + type=str, + choices=[ + "ijepa_vith14_1k", + "ijepa_vith14_22k", + "ijepa_vith16_1k", + "ijepa_vitg16_22k", + ], + help="Name of the model you'd like to convert.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--safe_serialization", default=True, type=bool, help="Whether or not to save using `safetensors`." 
+ ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether or not to push the model to the 🤗 Hub.", + ) + parser.add_argument( + "--verify_logits", action="store_false", help="Whether or not to verify logits after conversion." + ) + + parser.set_defaults() + args = parser.parse_args() + write_model(args.model_name, args.output_dir, args.safe_serialization, args.push_to_hub, args.verify_logits) + + +if __name__ == "__main__": + main() diff --git a/src/transformers/models/ijepa/modeling_ijepa.py b/src/transformers/models/ijepa/modeling_ijepa.py new file mode 100644 index 00000000000000..df254455bad5ab --- /dev/null +++ b/src/transformers/models/ijepa/modeling_ijepa.py @@ -0,0 +1,751 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/ijepa/modular_ijepa.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_ijepa.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +import collections.abc +import math +from typing import Dict, List, Optional, Set, Tuple, Union + +import torch +import torch.nn as nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + torch_int, +) +from .configuration_ijepa import IJepaConfig + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "facebook/ijepa_vith14_1k" + +# General docstring +_CONFIG_FOR_DOC = "IJepaConfig" + + +class IJepaPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + f" Expected {self.num_channels} but got {num_channels}." 
+ ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings + + +class IJepaEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + """ + + def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None: + super().__init__() + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None + self.patch_embeddings = IJepaPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = config.patch_size + self.config = config + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] + num_positions = self.position_embeddings.shape[1] + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + patch_pos_embed = self.position_embeddings + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return patch_pos_embed + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +class IJepaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple 
interface for downloading and loading pretrained + models. + """ + + config_class = IJepaConfig + base_model_prefix = "ijepa" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["IJepaEmbeddings", "IJepaLayer"] + _supports_sdpa = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, IJepaEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + +class IJepaSelfAttention(nn.Module): + def __init__(self, config: IJepaConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
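+        # Dropout is applied element-wise to the normalized attention weights, so for a given query some
+        # key positions are removed entirely for this forward pass; the surviving weights are rescaled by
+        # 1/(1 - p) at training time, and the layer is a no-op in eval mode.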
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class IJepaSdpaSelfAttention(IJepaSelfAttention): + def __init__(self, config: IJepaConfig) -> None: + super().__init__(config) + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + + def forward( + self, + hidden_states: torch.FloatTensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions or head_mask is not None: + logger.warning_once( + "`IJepaSdpaAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but " + "specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + ) + + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + head_mask, + self.attention_probs_dropout_prob if self.training else 0.0, + is_causal=False, + scale=None, + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, None + + +class IJepaSelfOutput(nn.Module): + """ + The residual connection is defined in IJepaLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. 
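+    Note that the `input_tensor` argument of `forward` is not used inside this module; the residual
+    addition happens in `IJepaLayer`.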
+ """ + + def __init__(self, config: IJepaConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class IJepaAttention(nn.Module): + def __init__(self, config: IJepaConfig) -> None: + super().__init__() + self.attention = IJepaSelfAttention(config) + self.output = IJepaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class IJepaSdpaAttention(IJepaAttention): + def __init__(self, config: IJepaConfig) -> None: + super().__init__(config) + self.attention = IJepaSdpaSelfAttention(config) + + +class IJepaIntermediate(nn.Module): + def __init__(self, config: IJepaConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +class IJepaOutput(nn.Module): + def __init__(self, config: IJepaConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +IJEPA_ATTENTION_CLASSES = { + "eager": IJepaAttention, + "sdpa": IJepaSdpaAttention, +} + + +class IJepaLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: IJepaConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = 
IJEPA_ATTENTION_CLASSES[config._attn_implementation](config) + self.intermediate = IJepaIntermediate(config) + self.output = IJepaOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in IJepa, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in IJepa, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +class IJepaEncoder(nn.Module): + def __init__(self, config: IJepaConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([IJepaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class IJepaPooler(nn.Module): + def __init__(self, config: IJepaConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +IJEPA_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. 
See [`IJepaImageProcessor.__call__`] + for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + + +IJEPA_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`IJepaConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare IJepa Model transformer outputting raw hidden-states without any specific head on top.", + IJEPA_START_DOCSTRING, +) +class IJepaModel(IJepaPreTrainedModel): + def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False): + super().__init__(config) + self.config = config + self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token) + self.encoder = IJepaEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pooler = IJepaPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> IJepaPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(IJEPA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). 
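+            Passing `bool_masked_pos` requires the model to have been created with `use_mask_token=True`,
+            otherwise there is no `mask_token` available to substitute for the masked patches.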
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?) + expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype + if pixel_values.dtype != expected_dtype: + pixel_values = pixel_values.to(expected_dtype) + + embedding_output = self.embeddings( + pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding + ) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/ijepa-base-patch16-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +@add_start_docstrings( + """ + IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states) + e.g. for ImageNet. + + + + Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. 
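+    For example, `model(pixel_values, interpolate_pos_encoding=True)` resizes the position embeddings
+    (via bicubic interpolation) to match the resolution of `pixel_values` at runtime.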
+ + + """, + IJEPA_START_DOCSTRING, +) +class IJepaForImageClassification(IJepaPreTrainedModel): + def __init__(self, config: IJepaConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.ijepa = IJepaModel(config, add_pooling_layer=False) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(IJEPA_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ijepa( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output.mean(dim=1)) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["IJepaPreTrainedModel", "IJepaModel", "IJepaForImageClassification"] diff --git a/src/transformers/models/ijepa/modular_ijepa.py b/src/transformers/models/ijepa/modular_ijepa.py new file mode 100644 index 00000000000000..3b3756dd5ce697 --- /dev/null +++ b/src/transformers/models/ijepa/modular_ijepa.py @@ -0,0 +1,255 @@ +from typing import Optional, Union + +import torch +import torch.nn as nn +from torch.nn import 
BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.models.ijepa.configuration_ijepa import IJepaConfig + +from ...modeling_outputs import ImageClassifierOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_start_docstrings, + torch_int, +) +from ..vit.modeling_vit import ( + ViTEmbeddings, + ViTForImageClassification, + ViTModel, +) + + +_CHECKPOINT_FOR_DOC = "facebook/ijepa_vith14_1k" + + +class IJepaEmbeddings(ViTEmbeddings): + def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None: + super().__init__(config, use_mask_token) + # Remove cls_token from IJepaEmbeddings, as it is not used in the model + del self.cls_token + num_patches = self.patch_embeddings.num_patches + self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size)) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] + num_positions = self.position_embeddings.shape[1] + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + patch_pos_embed = self.position_embeddings + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return patch_pos_embed + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + if bool_masked_pos is not None: + seq_length = embeddings.shape[1] + mask_tokens = self.mask_token.expand(batch_size, seq_length, -1) + # replace the masked visual tokens by mask_tokens + mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1.0 - mask) + mask_tokens * mask + + # add positional encoding to each token + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embeddings + + embeddings = self.dropout(embeddings) + + return embeddings + + +class IJepaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = IJepaConfig + base_model_prefix = "ijepa" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["IJepaEmbeddings", "IJepaLayer"] + _supports_sdpa = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, IJepaEmbeddings): + module.position_embeddings.data = nn.init.trunc_normal_( + module.position_embeddings.data.to(torch.float32), + mean=0.0, + std=self.config.initializer_range, + ).to(module.position_embeddings.dtype) + + +_EXPECTED_OUTPUT_SHAPE = [1, 256, 1280] + +IJEPA_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`IJepaConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare IJepa Model transformer outputting raw hidden-states without any specific head on top.", + IJEPA_START_DOCSTRING, +) +class IJepaModel(IJepaPreTrainedModel, ViTModel): + def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False): + super().__init__(config) + self.config = config + self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token) + + +_IMAGE_CLASS_CHECKPOINT = "facebook/ijepa_vith14_1k" +_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" + + +@add_start_docstrings( + """ + IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states) + e.g. for ImageNet. + + + + Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by + setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained + position embeddings to the higher resolution. + + + """, + IJEPA_START_DOCSTRING, +) +class IJepaForImageClassification(IJepaPreTrainedModel, ViTForImageClassification): + def __init__(self, config: IJepaConfig): + super().__init__(config) + self.ijepa = IJepaModel(config, add_pooling_layer=False) + self.post_init() + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. 
Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.ijepa( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output.mean(dim=1)) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "IJepaPreTrainedModel", + "IJepaModel", + "IJepaForImageClassification", +] diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index e5622185bc39a8..acce24cc42f5d8 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1591,11 +1591,12 @@ def generate( ) if input_ids is None: - input_ids = ( - torch.LongTensor([[self.config.text_config.bos_token_id]]) - .repeat(batch_size, 1) - .to(image_embeds.device) - ) + start_tokens = [self.config.text_config.bos_token_id] + if getattr(self.config, "image_token_index", None) is not None: + start_tokens = [self.config.image_token_index] * self.config.num_query_tokens + start_tokens + input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device) + input_ids = input_ids.repeat(batch_size, 1) + if attention_mask is None: attention_mask = torch.ones_like(input_ids) diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index b0a494dcfe6cec..e91b05bc015263 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -1626,11 +1626,12 @@ def generate( ) if input_ids is None: - input_ids = ( - torch.LongTensor([[self.config.text_config.bos_token_id]]) - .repeat(batch_size, 1) - .to(image_embeds.device) - ) + start_tokens = [self.config.text_config.bos_token_id] + if getattr(self.config, "video_token_index", None) is not 
None: + start_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4 + start_tokens + input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device) + input_ids = input_ids.repeat(batch_size, 1) + if attention_mask is None: attention_mask = torch.ones_like(input_ids) diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index b0dc8a215740f1..7184955af3aa56 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -439,11 +439,12 @@ def generate( ) if input_ids is None: - input_ids = ( - torch.LongTensor([[self.config.text_config.bos_token_id]]) - .repeat(batch_size, 1) - .to(image_embeds.device) - ) + start_tokens = [self.config.text_config.bos_token_id] + if getattr(self.config, "video_token_index", None) is not None: + start_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4 + start_tokens + input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device) + input_ids = input_ids.repeat(batch_size, 1) + if attention_mask is None: attention_mask = torch.ones_like(input_ids) diff --git a/src/transformers/models/jamba/configuration_jamba.py b/src/transformers/models/jamba/configuration_jamba.py index b493db7ed456b3..3aabe979d8ef2f 100644 --- a/src/transformers/models/jamba/configuration_jamba.py +++ b/src/transformers/models/jamba/configuration_jamba.py @@ -114,7 +114,7 @@ class JambaConfig(PretrainedConfig): mamba_expand (`int`, *optional*, defaults to 2): Expanding factor (relative to hidden_size) used to determine the mamba intermediate size mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`): - Rank of the the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)` + Rank of the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)` mamba_conv_bias (`bool`, *optional*, defaults to `True`): Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block. mamba_proj_bias (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 32ae6ea02eba5b..a185d5ebc6e86c 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -852,7 +852,7 @@ class JambaSparseMoeBlock(nn.Module): This implementation is strictly equivalent to standard MoE with full capacity (no dropped tokens). 
It's faster since it formulates MoE operations - in terms of block-sparse operations to accomodate imbalanced + in terms of block-sparse operations to accommodate imbalanced assignments of tokens to experts, whereas standard MoE either (1) drop tokens at the cost of reduced performance or (2) set capacity factor to number of experts and thus waste computation diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index a3667e06534564..98d5ecdd2a4fdb 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -141,6 +141,16 @@ class LlamaConfig(PretrainedConfig): model_type = "llama" keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `LlamaModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } def __init__( self, diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 4d95f01849d678..0408bb73c7f2da 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -21,7 +21,6 @@ from typing import List, Optional, Tuple, Union import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn @@ -240,25 +239,7 @@ def __init__(self, config): self.act_fn = ACT2FN[config.hidden_act] def forward(self, x): - if self.config.pretraining_tp > 1: - slice = self.intermediate_size // self.config.pretraining_tp - gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) - up_proj_slices = self.up_proj.weight.split(slice, dim=0) - down_proj_slices = self.down_proj.weight.split(slice, dim=1) - - gate_proj = torch.cat( - [F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1 - ) - up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1) - - intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) - down_proj = [ - F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp) - ] - down_proj = sum(down_proj) - else: - down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) - + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) return down_proj @@ -320,31 +301,14 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() - if self.config.pretraining_tp > 1: - key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp - query_slices = self.q_proj.weight.split( - (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 - ) - key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) - value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) - - query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] - query_states = torch.cat(query_states, dim=-1) - - key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] - key_states = torch.cat(key_states, dim=-1) - - value_states = [F.linear(hidden_states, value_slices[i]) for i in 
range(self.config.pretraining_tp)] - value_states = torch.cat(value_states, dim=-1) - - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) if position_embeddings is None: logger.warning_once( @@ -386,12 +350,7 @@ def forward( attn_output = attn_output.reshape(bsz, q_len, -1) - if self.config.pretraining_tp > 1: - attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) - o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) - attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) - else: - attn_output = self.o_proj(attn_output) + attn_output = self.o_proj(attn_output) if not output_attentions: attn_weights = None @@ -564,9 +523,10 @@ def forward( key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) if position_embeddings is None: logger.warning_once( @@ -850,7 +810,10 @@ def __init__(self, config: LlamaConfig): ) self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.rotary_emb = LlamaRotaryEmbedding(config=config) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") # Initialize weights and apply final processing self.post_init() @@ -930,7 +893,7 @@ def forward( all_self_attns = () if output_attentions else None next_decoder_cache = None - for decoder_layer in self.layers: + for decoder_layer in self.layers[: self.config.num_hidden_layers]: if output_hidden_states: all_hidden_states += (hidden_states,) @@ -1113,6 +1076,7 @@ class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... 
class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin): _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} def __init__(self, config): super().__init__(config) @@ -1211,13 +1175,8 @@ def forward( ) hidden_states = outputs[0] - if self.config.pretraining_tp > 1: - lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) - logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] - logits = torch.cat(logits, dim=-1) - else: - # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) loss = None if labels is not None: diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 820fa581711a63..08caa3d1d8a75a 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -58,10 +58,19 @@ class LlavaProcessor(ProcessorMixin): in a chat into a tokenizable string. image_token (`str`, *optional*, defaults to `""`): Special token used to denote image location. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. """ attributes = ["image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token"] + valid_kwargs = [ + "chat_template", + "patch_size", + "vision_feature_select_strategy", + "image_token", + "num_additional_image_tokens", + ] image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" @@ -73,9 +82,11 @@ def __init__( vision_feature_select_strategy=None, chat_template=None, image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + num_additional_image_tokens=0, **kwargs, ): self.patch_size = patch_size + self.num_additional_image_tokens = num_additional_image_tokens self.vision_feature_select_strategy = vision_feature_select_strategy self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token super().__init__(image_processor, tokenizer, chat_template=chat_template) @@ -147,9 +158,11 @@ def __call__( # Replace the image token with the expanded image token sequence pixel_values = image_inputs["pixel_values"] height, width = get_image_size(to_numpy_array(pixel_values[0])) - num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1 + num_image_tokens = (height // self.patch_size) * ( + width // self.patch_size + ) + self.num_additional_image_tokens if self.vision_feature_select_strategy == "default": - num_image_tokens -= 1 + num_image_tokens -= self.num_additional_image_tokens prompt_strings = [] for sample in text: diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 89b885f0f1abb2..38173cbd861fc1 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -61,10 +61,19 @@ class LlavaNextProcessor(ProcessorMixin): in a chat into a tokenizable string. 
image_token (`str`, *optional*, defaults to `""`): Special token used to denote image location. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. """ attributes = ["image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token"] + valid_kwargs = [ + "chat_template", + "patch_size", + "vision_feature_select_strategy", + "image_token", + "num_additional_image_tokens", + ] image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" @@ -76,9 +85,11 @@ def __init__( vision_feature_select_strategy=None, chat_template=None, image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + num_additional_image_tokens=0, **kwargs, ): self.patch_size = patch_size + self.num_additional_image_tokens = num_additional_image_tokens self.vision_feature_select_strategy = vision_feature_select_strategy self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token super().__init__(image_processor, tokenizer, chat_template=chat_template) @@ -152,10 +163,13 @@ def __call__( for sample in text: while self.image_token in sample: image_size = next(image_sizes) + if not isinstance(image_size, (list, tuple)): + # cast to list to avoid numerical precision errors when calculating unpadding + image_size = image_size.tolist() orig_height, orig_width = image_size num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": - num_image_tokens -= 1 + num_image_tokens -= self.num_additional_image_tokens sample = sample.replace(self.image_token, "" * num_image_tokens, 1) prompt_strings.append(sample) prompt_strings = [sample.replace("", self.image_token) for sample in prompt_strings] @@ -178,7 +192,7 @@ def _get_number_of_features(self, orig_height: int, orig_width: int, height: int orig_height, orig_width, patches_height, patches_width, scale_height, scale_width ) # The base patch covers the entire image (+1 for the CLS) - base_features = patches_height * patches_width + 1 + base_features = patches_height * patches_width + self.num_additional_image_tokens num_image_tokens = unpadded_features + newline_features + base_features return num_image_tokens diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index a42aafcadd64c6..65195b77240721 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -58,12 +58,22 @@ class LlavaNextVideoProcessor(ProcessorMixin): Special token used to denote video location. image_token (`str`, *optional*, defaults to `""`): Special token used to denote image location. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. 
""" # video and image processor share same args, but have different processing logic # only image processor config is saved in the hub attributes = ["video_processor", "image_processor", "tokenizer"] - valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token", "video_token"] + valid_kwargs = [ + "chat_template", + "patch_size", + "vision_feature_select_strategy", + "image_token", + "video_token", + "num_additional_image_tokens", + ] image_processor_class = "LlavaNextImageProcessor" video_processor_class = "LlavaNextVideoImageProcessor" tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") @@ -78,9 +88,11 @@ def __init__( vision_feature_select_strategy=None, video_token="