diff --git a/.github/workflows/check-vcs-installation.yml b/.github/workflows/check-vcs-installation.yml new file mode 100644 index 00000000000000..673d2cbba91546 --- /dev/null +++ b/.github/workflows/check-vcs-installation.yml @@ -0,0 +1,79 @@ +name: Check VCS installation + +on: + push: + branches: + - main + - v*-release + pull_request: + workflow_dispatch: + +permissions: + contents: read + +concurrency: + group: "${{ github.workflow }}-${{ github.ref }}" + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + check-vcs-installation: + runs-on: ubuntu-22.04 # oldest Ubuntu LTS whose system Python (3.10) satisfies `requires-python = ">= 3.9"`; 20.04 ships Python 3.8 + timeout-minutes: 30 + steps: + - name: Setup system pip + run: | + sudo apt-get update && sudo apt-get install -y python3-dev python3-pip + echo '$ which -a python3' && which -a python3 || true + echo '$ which -a python' && which -a python || true + echo '$ which -a pip3' && which -a pip3 || true + echo '$ which -a pip' && which -a pip || true + echo '$ /usr/bin/python3 --version' && /usr/bin/python3 --version + echo '$ /usr/bin/python3 -m pip --version' && /usr/bin/python3 -m pip --version + + - name: Print commit information + # Pass context values via `env`: `github.head_ref` is attacker-controlled on pull requests and must not be interpolated into the script text. + env: + EVENT_NAME: ${{ github.event_name }} + BASE_REPOSITORY: ${{ github.repository }} + HEAD_REPOSITORY: ${{ github.event.pull_request.head.repo.full_name }} + SHA: ${{ github.sha }} + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + run: | + if [[ "${EVENT_NAME}" != 'pull_request' ]]; then + REPOSITORY="${BASE_REPOSITORY}" + else + REPOSITORY="${HEAD_REPOSITORY}" # name of the fork repository + fi + echo "REPOSITORY: ${REPOSITORY}" + echo "SHA: ${SHA}" + echo "BRANCH_NAME: ${BRANCH_NAME}" + echo "VCS_URL=https://github.com/${REPOSITORY}@${BRANCH_NAME}" >> "${GITHUB_ENV}" + + - name: Check transformers installation from VCS URL + run: | + /usr/bin/python3 -m pip install -vvv "git+${VCS_URL}" + (cd /tmp && /usr/bin/python3 -c 'import transformers') + /usr/bin/python3 -m pip uninstall transformers --yes + + - name: Check transformers installation from VCS URL (editable) + run: | + /usr/bin/python3 -m pip install -vvv -e 
"git+${VCS_URL}#egg=transformers" + (cd /tmp && /usr/bin/python3 -c 'import transformers') + /usr/bin/python3 -m pip uninstall transformers --yes + + - name: Checkout transformers + uses: actions/checkout@v4 + with: + submodules: "recursive" + + - name: Check transformers installation from VCS repo + run: | + /usr/bin/python3 -m pip install -vvv . + (cd /tmp && /usr/bin/python3 -c 'import transformers') + /usr/bin/python3 -m pip uninstall transformers --yes + + - name: Check transformers installation from VCS repo (editable) + run: | + /usr/bin/python3 -m pip install -vvv -e . + (cd /tmp && /usr/bin/python3 -c 'import transformers') + /usr/bin/python3 -m pip uninstall transformers --yes diff --git a/pyproject.toml b/pyproject.toml index bf78e0174394f5..c8108818e75dfb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,24 +1,73 @@ -[tool.coverage.run] -source = ["transformers"] -omit = [ - "*/convert_*", - "*/__main__.py" -] +# Package ###################################################################### -[tool.coverage.report] -exclude_lines = [ - "pragma: no cover", - "raise", - "except", - "register_parameter" +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "transformers" +description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" +readme = "README.md" +requires-python = ">= 3.9" # NOTE: also update the classifiers below +authors = [ + { name = "The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", email = "transformers@huggingface.co" }, +] +license = { text = "Apache 2.0 License" } +keywords = [ + "NLP", + "vision", + "speech", + "deep learning", + "transformer", + "pytorch", + "tensorflow", + "jax", + "BERT", + "GPT-2", + "Wav2Vec2", + "ViT", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + 
"Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering :: Artificial Intelligence", ] +dynamic = ["dependencies", "optional-dependencies", "version"] + +[project.scripts] +transformers-cli = "transformers.commands.transformers_cli:main" + +[project.urls] +Homepage = "https://huggingface.co" +Repository = "https://github.com/huggingface/transformers" +Documentation = "https://huggingface.co/docs/transformers" +"Bug Report" = "https://github.com/huggingface/transformers/issues" + +[tool.setuptools] +include-package-data = true +zip-safe = false + +[tool.setuptools.packages.find] +where = ["src"] + +[tool.setuptools.package-data] +transformers = ["**/*.cu", "**/*.cpp", "**/*.cuh", "**/*.h", "**/*.pyx"] # recursive globs: same patterns setup.py previously passed via package_data + +# Linter tools ################################################################# [tool.ruff] line-length = 119 [tool.ruff.lint] # Never enforce `E501` (line length violations). -ignore = ["C901", "E501", "E741", "F402", "F823" ] +ignore = ["C901", "E501", "E741", "F402", "F823"] select = ["C", "E", "F", "I", "W"] # Ignore import violations in all `__init__.py` files. @@ -44,11 +93,28 @@ skip-magic-trailing-comma = false # Like Black, automatically detect the appropriate line ending. 
line-ending = "auto" +# Testing ###################################################################### + [tool.pytest.ini_options] addopts = "--doctest-glob='**/*.md'" -doctest_optionflags="NUMBER NORMALIZE_WHITESPACE ELLIPSIS" +doctest_optionflags = "NUMBER NORMALIZE_WHITESPACE ELLIPSIS" markers = [ "flash_attn_test: marks tests related to flash attention (deselect with '-m \"not flash_attn_test\"')", "bitsandbytes: select (or deselect with `not`) bitsandbytes integration tests", - "generate: marks tests that use the GenerationTesterMixin" + "generate: marks tests that use the GenerationTesterMixin", +] + +[tool.coverage.run] +source = ["transformers"] +omit = [ + "*/convert_*", + "*/__main__.py", +] + +[tool.coverage.report] +exclude_lines = [ + "pragma: no cover", + "raise", + "except", + "register_parameter", ] diff --git a/setup.py b/setup.py index a78bb20dd0a4b0..dc4bcd40ca166a 100644 --- a/setup.py +++ b/setup.py @@ -72,21 +72,19 @@ import shutil from pathlib import Path -from setuptools import Command, find_packages, setup +from setuptools import Command, setup # Remove stale transformers.egg-info directory to avoid https://github.com/pypa/pip/issues/5466 stale_egg_info = Path(__file__).parent / "transformers.egg-info" if stale_egg_info.exists(): print( - ( - "Warning: {} exists.\n\n" - "If you recently updated transformers to 3.0 or later, this is expected,\n" - "but it may prevent transformers from installing in editable mode.\n\n" - "This directory is automatically generated by Python's packaging tools.\n" - "I will remove it now.\n\n" - "See https://github.com/pypa/pip/issues/5466 for details.\n" - ).format(stale_egg_info) + f"Warning: {stale_egg_info} exists.\n\n" + "If you recently updated transformers to 3.0 or later, this is expected,\n" + "but it may prevent transformers from installing in editable mode.\n\n" + "This directory is automatically generated by Python's packaging tools.\n" + "I will remove it now.\n\n" + "See 
https://github.com/pypa/pip/issues/5466 for details.\n" ) shutil.rmtree(stale_egg_info) @@ -437,36 +435,10 @@ def run(self): setup( name="transformers", - version="4.48.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) - author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", - author_email="transformers@huggingface.co", - description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", - long_description=open("README.md", "r", encoding="utf-8").read(), - long_description_content_type="text/markdown", - keywords="NLP vision speech deep learning transformer pytorch tensorflow jax BERT GPT-2 Wav2Vec2 ViT", - license="Apache 2.0 License", - url="https://github.com/huggingface/transformers", - package_dir={"": "src"}, - packages=find_packages("src"), - include_package_data=True, - package_data={"": ["**/*.cu", "**/*.cpp", "**/*.cuh", "**/*.h", "**/*.pyx"]}, - zip_safe=False, + # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.48.0.dev0", extras_require=extras, - entry_points={"console_scripts": ["transformers-cli=transformers.commands.transformers_cli:main"]}, - python_requires=">=3.9.0", install_requires=list(install_requires), - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Education", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - ], cmdclass={"deps_table_update": DepsTableUpdateCommand}, ) diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py 
b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index 1c4051f2e2645c..4f1cb32348d8d3 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -179,7 +179,10 @@ def check_encoder_decoder_model_from_pretrained_using_model_paths( **kwargs, ): encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config) - with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: + with ( + tempfile.TemporaryDirectory() as encoder_tmp_dirname, + tempfile.TemporaryDirectory() as decoder_tmp_dirname, + ): encoder_model.save_pretrained(encoder_tmp_dirname) decoder_model.save_pretrained(decoder_tmp_dirname) model_kwargs = {"encoder_hidden_dropout_prob": 0.0} @@ -306,7 +309,10 @@ def check_save_and_load_encoder_decoder_model( out_2 = outputs[0].cpu().numpy() out_2[np.isnan(out_2)] = 0 - with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: + with ( + tempfile.TemporaryDirectory() as encoder_tmp_dirname, + tempfile.TemporaryDirectory() as decoder_tmp_dirname, + ): enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname) enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname) enc_dec_model = EncoderDecoderModel.from_encoder_decoder_pretrained( diff --git a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py index 897d4b056f1977..399a111530d1dd 100644 --- a/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py +++ b/tests/models/speech_encoder_decoder/test_modeling_speech_encoder_decoder.py @@ -266,7 +266,10 @@ def check_save_and_load_encoder_decoder_model( out_2 = outputs[0].cpu().numpy() out_2[np.isnan(out_2)] = 0 - with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: 
+ with ( + tempfile.TemporaryDirectory() as encoder_tmp_dirname, + tempfile.TemporaryDirectory() as decoder_tmp_dirname, + ): enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname) enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname) SpeechEncoderDecoderModel.from_encoder_decoder_pretrained( diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py index 2b517034bffb15..96bc01cbf94f3a 100644 --- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py +++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py @@ -216,7 +216,10 @@ def check_save_and_load_encoder_decoder_model( out_2 = outputs[0].cpu().numpy() out_2[np.isnan(out_2)] = 0 - with tempfile.TemporaryDirectory() as encoder_tmp_dirname, tempfile.TemporaryDirectory() as decoder_tmp_dirname: + with ( + tempfile.TemporaryDirectory() as encoder_tmp_dirname, + tempfile.TemporaryDirectory() as decoder_tmp_dirname, + ): enc_dec_model.encoder.save_pretrained(encoder_tmp_dirname) enc_dec_model.decoder.save_pretrained(decoder_tmp_dirname) VisionEncoderDecoderModel.from_encoder_decoder_pretrained( diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py index b91d66654de6ae..6dab7027f1c435 100644 --- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py @@ -623,9 +623,10 @@ def test_wav2vec2_with_lm_pool(self): self.assertEqual(transcription[0], "bien y qué regalo vas a abrir primero") # user-managed pool + num_processes should trigger a warning - with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, multiprocessing.get_context("fork").Pool( - 2 - ) as pool: + with ( + CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, + multiprocessing.get_context("fork").Pool(2) as pool, + ): transcription = 
processor.batch_decode(np.array(logits), pool, num_processes=2).text self.assertIn("num_process", cl.out) diff --git a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py index 7ef97290e61c9f..40e1bd8208cc29 100644 --- a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py @@ -827,9 +827,10 @@ def test_wav2vec2_with_lm_pool(self): self.assertEqual(transcription[0], "el libro ha sido escrito por cervantes") # user-managed pool + num_processes should trigger a warning - with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, multiprocessing.get_context("fork").Pool( - 2 - ) as pool: + with ( + CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, + multiprocessing.get_context("fork").Pool(2) as pool, + ): transcription = processor.batch_decode(logits.numpy(), pool, num_processes=2).text self.assertIn("num_process", cl.out) diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index b2d90adc79da96..f93467236967ba 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -1889,9 +1889,10 @@ def test_wav2vec2_with_lm_pool(self): self.assertEqual(transcription[0], "habitan aguas poco profundas y rocosas") # user-managed pool + num_processes should trigger a warning - with CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, multiprocessing.get_context("fork").Pool( - 2 - ) as pool: + with ( + CaptureLogger(processing_wav2vec2_with_lm.logger) as cl, + multiprocessing.get_context("fork").Pool(2) as pool, + ): transcription = processor.batch_decode(logits.cpu().numpy(), pool, num_processes=2).text self.assertIn("num_process", cl.out) diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py index 22e9fcae471f33..64cd3752024aaf 100644 --- a/utils/download_glue_data.py +++ b/utils/download_glue_data.py @@ -79,9 +79,11 @@ def 
format_mrpc(data_dir, path_to_data): for row in ids_fh: dev_ids.append(row.strip().split("\t")) - with open(mrpc_train_file, encoding="utf8") as data_fh, open( - os.path.join(mrpc_dir, "train.tsv"), "w", encoding="utf8" - ) as train_fh, open(os.path.join(mrpc_dir, "dev.tsv"), "w", encoding="utf8") as dev_fh: + with ( + open(mrpc_train_file, encoding="utf8") as data_fh, + open(os.path.join(mrpc_dir, "train.tsv"), "w", encoding="utf8") as train_fh, + open(os.path.join(mrpc_dir, "dev.tsv"), "w", encoding="utf8") as dev_fh, + ): header = data_fh.readline() train_fh.write(header) dev_fh.write(header) @@ -92,9 +94,10 @@ def format_mrpc(data_dir, path_to_data): else: train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) - with open(mrpc_test_file, encoding="utf8") as data_fh, open( - os.path.join(mrpc_dir, "test.tsv"), "w", encoding="utf8" - ) as test_fh: + with ( + open(mrpc_test_file, encoding="utf8") as data_fh, + open(os.path.join(mrpc_dir, "test.tsv"), "w", encoding="utf8") as test_fh, + ): header = data_fh.readline() test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n") for idx, row in enumerate(data_fh):