diff --git a/.circleci/config.yml b/.circleci/config.yml index 9c414901c4f5ac..75413af8bf5254 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -58,14 +58,14 @@ jobs: name: "Prepare pipeline parameters" command: | python utils/process_test_artifacts.py - + # To avoid too long generated_config.yaml on the continuation orb, we pass the links to the artifacts as parameters. # Otherwise the list of tests was just too big. Explicit is good but for that it was a limitation. # We used: # https://circleci.com/docs/api/v2/index.html#operation/getJobArtifacts : to get the job artifacts # We could not pass a nested dict, which is why we create the test_file_... parameters for every single job - + - store_artifacts: path: test_preparation/transformed_artifacts.json - store_artifacts: diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 7ccf5ec96cec4f..71c75dac2ff053 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -32,7 +32,7 @@ "RUN_PT_FLAX_CROSS_TESTS": False, } # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical -COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsf":None} +COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsfE":None} DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}] @@ -40,9 +40,23 @@ class EmptyJob: job_name = "empty" def to_dict(self): + steps = [{"run": 'ls -la'}] + if self.job_name == "collection_job": + steps.extend( + [ + "checkout", + {"run": "pip install requests || true"}, + {"run": """while [[ $(curl --location --request GET "https://circleci.com/api/v2/workflow/$CIRCLE_WORKFLOW_ID/job" --header "Circle-Token: $CCI_TOKEN"| jq -r '.items[]|select(.name != "collection_job")|.status' | grep -c "running") -gt 0 ]]; do sleep 5; done || true"""}, + {"run": 'python utils/process_circleci_workflow_test_reports.py --workflow_id $CIRCLE_WORKFLOW_ID || true'}, + {"store_artifacts": {"path": "outputs"}}, + {"run": 'echo "All required jobs have now completed"'}, + ] + ) + return { "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE), - "steps":["checkout"], + "resource_class": "small", + "steps": steps, } @@ -54,9 +68,9 @@ class CircleCIJob: install_steps: List[str] = None marker: Optional[str] = None parallelism: Optional[int] = 0 - pytest_num_workers: int = 12 + pytest_num_workers: int = 8 pytest_options: Dict[str, Any] = None - resource_class: Optional[str] = "2xlarge" + resource_class: Optional[str] = "xlarge" tests_to_run: Optional[List[str]] = None num_test_files_per_worker: Optional[int] = 10 # This should be only used for doctest job! 
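For readers of this diff: the new `collection_job` is an `EmptyJob` whose steps poll the CircleCI v2 API until every other job in the workflow has stopped running, then aggregate their reports with `utils/process_circleci_workflow_test_reports.py`. A minimal Python sketch of that polling loop (assuming, as the inline bash step does, that `CIRCLE_WORKFLOW_ID` and `CCI_TOKEN` are set in the environment, and that `requests` is installed) might look like:

```py
# Hedged sketch of the collection_job polling step above, not part of the PR itself.
import os
import time

import requests


def wait_for_other_jobs(poll_seconds: int = 5) -> None:
    # Same endpoint the bash step queries with curl + jq.
    url = f"https://circleci.com/api/v2/workflow/{os.environ['CIRCLE_WORKFLOW_ID']}/job"
    headers = {"Circle-Token": os.environ["CCI_TOKEN"]}
    while True:
        items = requests.get(url, headers=headers).json().get("items", [])
        # Ignore the collection job itself and wait until nothing else is running.
        still_running = [
            job for job in items
            if job.get("name") != "collection_job" and job.get("status") == "running"
        ]
        if not still_running:
            break
        time.sleep(poll_seconds)


if __name__ == "__main__":
    wait_for_other_jobs()
    print("All required jobs have now completed")
```

The actual step does the same with `curl` and `jq`, and appends `|| true` to each command so a polling hiccup never fails the collection job itself.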
@@ -133,7 +147,7 @@ def to_dict(self): "command": """dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true"""} }, {"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}}, - {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <>' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}}, + {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <> --header "Circle-Token: $CIRCLE_TOKEN"' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}}, {"run": {"name": "Split tests across parallel nodes: show current parallel tests", "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt" } @@ -185,7 +199,6 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="not generate", parallelism=6, - pytest_num_workers=8 ) generate_job = CircleCIJob( @@ -193,28 +206,24 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="generate", parallelism=6, - pytest_num_workers=8 ) tokenization_job = CircleCIJob( "tokenization", docker_image=[{"image": "huggingface/transformers-torch-light"}], parallelism=8, - pytest_num_workers=16 ) processor_job = CircleCIJob( "processors", docker_image=[{"image": "huggingface/transformers-torch-light"}], parallelism=8, - pytest_num_workers=6 ) tf_job = CircleCIJob( "tf", docker_image=[{"image":"huggingface/transformers-tf-light"}], parallelism=6, - pytest_num_workers=16, ) @@ -222,7 +231,8 @@ def job_name(self): "flax", docker_image=[{"image":"huggingface/transformers-jax-light"}], parallelism=6, - pytest_num_workers=16 + pytest_num_workers=16, + resource_class="2xlarge", ) @@ -231,7 +241,7 @@ def job_name(self): additional_env={"RUN_PIPELINE_TESTS": True}, docker_image=[{"image":"huggingface/transformers-torch-light"}], marker="is_pipeline_test", - parallelism=4 + parallelism=4, ) @@ -240,7 +250,7 @@ def job_name(self): additional_env={"RUN_PIPELINE_TESTS": True}, docker_image=[{"image":"huggingface/transformers-tf-light"}], marker="is_pipeline_test", - parallelism=4 + parallelism=4, ) @@ -257,7 +267,6 @@ def job_name(self): docker_image=[{"image":"huggingface/transformers-examples-torch"}], # TODO @ArthurZucker remove this once docker is easier to build install_steps=["uv venv && uv pip install . 
&& uv pip install -r examples/pytorch/_tests_requirements.txt"], - pytest_num_workers=8, ) @@ -265,7 +274,6 @@ def job_name(self): "examples_tensorflow", additional_env={"OMP_NUM_THREADS": 8}, docker_image=[{"image":"huggingface/transformers-examples-tf"}], - pytest_num_workers=16, ) @@ -280,6 +288,7 @@ def job_name(self): ], marker="is_staging_test", pytest_num_workers=2, + resource_class="medium", ) @@ -292,13 +301,13 @@ def job_name(self): ], pytest_options={"k onnx": None}, pytest_num_workers=1, + resource_class="small", ) exotic_models_job = CircleCIJob( "exotic_models", docker_image=[{"image":"huggingface/transformers-exotic-models"}], - pytest_num_workers=12, parallelism=4, pytest_options={"durations": 100}, ) @@ -317,7 +326,6 @@ def job_name(self): docker_image=[{"image": "huggingface/transformers-torch-light"}], marker="not generate", parallelism=6, - pytest_num_workers=8, ) @@ -352,6 +360,7 @@ def job_name(self): DOC_TESTS = [doc_test_job] ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip + def create_circleci_config(folder=None): if folder is None: folder = os.getcwd() @@ -361,7 +370,13 @@ def create_circleci_config(folder=None): if len(jobs) == 0: jobs = [EmptyJob()] - print("Full list of job name inputs", {j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}) + else: + print("Full list of job name inputs", {j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}) + # Add a job waiting all the test jobs and aggregate their test summary files at the end + collection_job = EmptyJob() + collection_job.job_name = "collection_job" + jobs = [collection_job] + jobs + config = { "version": "2.1", "parameters": { @@ -371,9 +386,14 @@ def create_circleci_config(folder=None): **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}, **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs}, }, - "jobs" : {j.job_name: j.to_dict() for j in jobs}, - "workflows": {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} + "jobs": {j.job_name: j.to_dict() for j in jobs} } + if "CIRCLE_TOKEN" in os.environ: + # For private forked repo. (e.g. new model addition) + config["workflows"] = {"version": 2, "run_tests": {"jobs": [{j.job_name: {"context": ["TRANSFORMERS_CONTEXT"]}} for j in jobs]}} + else: + # For public repo. (e.g. 
`transformers`) + config["workflows"] = {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} with open(os.path.join(folder, "generated_config.yml"), "w") as f: f.write(yaml.dump(config, sort_keys=False, default_flow_style=False).replace("' << pipeline", " << pipeline").replace(">> '", " >>")) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index eaa4b3b2f82456..1bbd1c1e94d08c 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -63,7 +63,7 @@ jobs: commit_id=$GITHUB_SHA fi commit_msg=$(git show -s --format=%s | cut -c1-70) - python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" + python3 benchmark/benchmarks_entrypoint.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg" env: HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} # Enable this to see debug logs diff --git a/.github/workflows/push-important-models.yml b/.github/workflows/push-important-models.yml index 1887af0f4c5bac..7294777655e183 100644 --- a/.github/workflows/push-important-models.yml +++ b/.github/workflows/push-important-models.yml @@ -134,10 +134,3 @@ jobs: slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }} slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} waitForSSH: true - - benchmark: - name: Benchmark workflow - needs: get_modified_models - if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }} - uses: ./.github/workflows/benchmark.yml - secrets: inherit diff --git a/.github/workflows/self-comment-ci.yml b/.github/workflows/self-comment-ci.yml new file mode 100644 index 00000000000000..b344ecfd59527d --- /dev/null +++ b/.github/workflows/self-comment-ci.yml @@ -0,0 +1,253 @@ +name: PR comment GitHub CI + +on: + issue_comment: + types: + - created + branches-ignore: + - main +concurrency: + group: ${{ github.workflow }}-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow') }} + cancel-in-progress: true + +jobs: + get-pr-number: + runs-on: ubuntu-22.04 + name: Get PR number + # For security: only allow team members to run + if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }} + outputs: + PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }} + steps: + - name: Get PR number + shell: bash + run: | + if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then + echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV + else + echo "PR_NUMBER=" >> $GITHUB_ENV + fi + + - name: Check PR number + shell: bash + run: | + echo "${{ env.PR_NUMBER }}" + + - name: Set PR number + id: set_pr_number + run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT" + + get-sha: + runs-on: ubuntu-22.04 + needs: get-pr-number + if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}} + outputs: + PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: "0" + ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge" + + - name: Get SHA + 
id: get_sha + env: + PR_NUMBER: ${{needs.get-pr-number.outputs.PR_NUMBER}} + run: | + git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head + git checkout refs/remotes/pull/$PR_NUMBER/head + echo "PR_HEAD_SHA: $(git log -1 --format=%H)" + echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT" + + # use a python script to handle this complex logic + # case 1: `run-slow` (auto. infer with limited number of models, but in particular, new model) + # case 2: `run-slow model_1, model_2` + get-tests: + runs-on: ubuntu-22.04 + needs: get-pr-number + if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}} + permissions: write-all + outputs: + models: ${{ steps.models_to_run.outputs.models }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: "0" + ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge" + + - name: Get models to test + env: + PR_COMMENT: ${{ github.event.comment.body }} + run: | + python -m pip install GitPython + python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt + echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV + + - name: Show models to test + id: models_to_run + run: | + echo "${{ env.models }}" + echo "models=${{ env.models }}" >> $GITHUB_ENV + echo "models=${{ env.models }}" >> $GITHUB_OUTPUT + + - name: Reply to the comment + if: ${{ env.models != '[]' }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \ + -f "body=This comment contains run-slow, running the specified jobs: ${{ env.models }} ..." + + create_run: + name: Create run + if: ${{ needs.get-tests.outputs.models != '[]' }} + needs: [get-sha, get-tests] + permissions: write-all + runs-on: ubuntu-22.04 + steps: + - name: Create Run + id: create_run + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`. + # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status + GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \ + -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests" + + run_models_gpu: + name: Run all tests for the model + if: ${{ needs.get-tests.outputs.models != '[]' }} + needs: [get-pr-number, get-tests, create_run] + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.get-tests.outputs.models) }} + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Echo input and matrix info + shell: bash + run: | + echo "${{ matrix.folders }}" + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). 
+ run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Checkout to PR merge commit + working-directory: /transformers + run: | + git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge + git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge + git log -1 --format=%H + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: | + export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})" + echo $CUDA_VISIBLE_DEVICES + python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt + + - name: Make sure report directory exists + shell: bash + run: | + mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt + echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + + update_run_status: + name: Update Check Run Status + needs: [get-sha, create_run, run_models_gpu] + permissions: write-all + if: ${{ always() && needs.create_run.result == 'success' }} + runs-on: ubuntu-22.04 + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + steps: + - name: Get `run_models_gpu` job status + run: | + echo "${{ needs.run_models_gpu.result }}" + if [ "${{ needs.run_models_gpu.result }}" = "cancelled" ]; then + echo "STATUS=failure" >> $GITHUB_ENV + elif [ "${{ needs.run_models_gpu.result }}" = "skipped" ]; then + echo "STATUS=success" >> $GITHUB_ENV + else + echo "STATUS=${{ 
needs.run_models_gpu.result }}" >> $GITHUB_ENV + fi + + - name: Update PR commit statuses + run: | + echo "${{ needs.run_models_gpu.result }}" + echo "${{ env.STATUS }}" + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \ + -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests" diff --git a/.github/workflows/self-nightly-past-ci-caller.yml b/.github/workflows/self-nightly-past-ci-caller.yml index 142399a6366ce6..46d811d4a43394 100644 --- a/.github/workflows/self-nightly-past-ci-caller.yml +++ b/.github/workflows/self-nightly-past-ci-caller.yml @@ -21,39 +21,6 @@ jobs: echo "$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" echo "run_number=$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" >> $GITHUB_OUTPUT - run_past_ci_pytorch_1-13: - name: PyTorch 1.13 - needs: get_number - if: needs.get_number.outputs.run_number == 0 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - uses: ./.github/workflows/self-past-caller.yml - with: - framework: pytorch - version: "1.13" - sha: ${{ github.sha }} - secrets: inherit - - run_past_ci_pytorch_1-12: - name: PyTorch 1.12 - needs: get_number - if: needs.get_number.outputs.run_number == 1 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - uses: ./.github/workflows/self-past-caller.yml - with: - framework: pytorch - version: "1.12" - sha: ${{ github.sha }} - secrets: inherit - - run_past_ci_pytorch_1-11: - name: PyTorch 1.11 - needs: get_number - if: needs.get_number.outputs.run_number == 2 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))) - uses: ./.github/workflows/self-past-caller.yml - with: - framework: pytorch - version: "1.11" - sha: ${{ github.sha }} - secrets: inherit - run_past_ci_tensorflow_2-11: name: TensorFlow 2.11 needs: get_number diff --git a/.github/workflows/self-pr-slow-ci.yml b/.github/workflows/self-pr-slow-ci.yml deleted file mode 100644 index 43fcecd8def21e..00000000000000 --- a/.github/workflows/self-pr-slow-ci.yml +++ /dev/null @@ -1,151 +0,0 @@ -name: PR slow CI - -on: - pull_request: - paths: - - "src/transformers/models/*/modeling_*.py" - - "tests/**/test_*.py" - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -env: - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - OMP_NUM_THREADS: 8 - MKL_NUM_THREADS: 8 - RUN_SLOW: yes - # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. - # This token is created under the bot `hf-transformers-bot`. 
- HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} - TF_FORCE_GPU_ALLOW_GROWTH: true - RUN_PT_TF_CROSS_TESTS: 1 - CUDA_VISIBLE_DEVICES: 0,1 - -jobs: - find_models_to_run: - runs-on: ubuntu-22.04 - name: Find models to run slow tests - # Triggered only if the required label `run-slow` is added - if: ${{ contains(github.event.pull_request.labels.*.name, 'run-slow') }} - outputs: - models: ${{ steps.models_to_run.outputs.models }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: "0" - ref: ${{ github.event.pull_request.head.sha }} - - - name: Get commit message - run: | - echo "commit_message=$(git show -s --format=%s)" >> $GITHUB_ENV - - - name: Get models to run slow tests - run: | - echo "${{ env.commit_message }}" - python -m pip install GitPython - python utils/pr_slow_ci_models.py --commit_message "${{ env.commit_message }}" | tee output.txt - echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV - - - name: Models to run slow tests - id: models_to_run - run: | - echo "${{ env.models }}" - echo "models=${{ env.models }}" >> $GITHUB_OUTPUT - - run_models_gpu: - name: Run all tests for the model - # Triggered only `find_models_to_run` is triggered (label `run-slow` is added) which gives the models to run - # (either a new model PR or via a commit message) - if: ${{ needs.find_models_to_run.outputs.models != '[]' }} - needs: find_models_to_run - strategy: - fail-fast: false - matrix: - folders: ${{ fromJson(needs.find_models_to_run.outputs.models) }} - machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] - runs-on: - group: '${{ matrix.machine_type }}' - container: - image: huggingface/transformers-all-latest-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Echo input and matrix info - shell: bash - run: | - echo "${{ matrix.folders }}" - - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Update clone - working-directory: /transformers - run: git fetch && git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/merge && git checkout pull/${{ github.event.pull_request.number }}/merge - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
&& python3 -m pip install --upgrade torch torchaudio torchvision - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Set `machine_type` for report and artifact names - working-directory: /transformers - shell: bash - run: | - echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then - machine_type=single-gpu - elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then - machine_type=multi-gpu - else - machine_type=${{ matrix.machine_type }} - fi - echo "$machine_type" - echo "machine_type=$machine_type" >> $GITHUB_ENV - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: | - export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})" - echo $CUDA_VISIBLE_DEVICES - python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt - - - name: Make sure report directory exists - shell: bash - run: | - mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports - echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt - echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" - - - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports diff --git a/.github/workflows/self-push-amd-mi210-caller.yml b/.github/workflows/self-push-amd-mi210-caller.yml index a401e40ee7f164..45b325f7b357bf 100644 --- a/.github/workflows/self-push-amd-mi210-caller.yml +++ b/.github/workflows/self-push-amd-mi210-caller.yml @@ -1,25 +1,25 @@ -name: Self-hosted runner (AMD mi210 CI caller) - -on: - workflow_run: - workflows: ["Self-hosted runner (push-caller)"] - branches: ["main"] - types: [completed] - push: - branches: - - run_amd_push_ci_caller* - paths: - - "src/**" - - "tests/**" - - ".github/**" - - "templates/**" - - "utils/**" - -jobs: - run_amd_ci: - name: AMD mi210 - if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller'))) - uses: ./.github/workflows/self-push-amd.yml - with: - gpu_flavor: mi210 - secrets: inherit +name: Self-hosted runner (AMD mi210 CI caller) + +on: + #workflow_run: + # workflows: ["Self-hosted runner (push-caller)"] + # branches: ["main"] + # types: [completed] + push: + branches: + - run_amd_push_ci_caller* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + +jobs: + run_amd_ci: + name: AMD mi210 + if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller'))) + uses: 
./.github/workflows/self-push-amd.yml + with: + gpu_flavor: mi210 + secrets: inherit diff --git a/.github/workflows/self-push-amd-mi250-caller.yml b/.github/workflows/self-push-amd-mi250-caller.yml index fef532703170cb..91b978b593d0b5 100644 --- a/.github/workflows/self-push-amd-mi250-caller.yml +++ b/.github/workflows/self-push-amd-mi250-caller.yml @@ -1,25 +1,25 @@ -name: Self-hosted runner (AMD mi250 CI caller) - -on: - workflow_run: - workflows: ["Self-hosted runner (push-caller)"] - branches: ["main"] - types: [completed] - push: - branches: - - run_amd_push_ci_caller* - paths: - - "src/**" - - "tests/**" - - ".github/**" - - "templates/**" - - "utils/**" - -jobs: - run_amd_ci: - name: AMD mi250 - if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller'))) - uses: ./.github/workflows/self-push-amd.yml - with: - gpu_flavor: mi250 - secrets: inherit +name: Self-hosted runner (AMD mi250 CI caller) + +on: + #workflow_run: + # workflows: ["Self-hosted runner (push-caller)"] + # branches: ["main"] + # types: [completed] + push: + branches: + - run_amd_push_ci_caller* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + +jobs: + run_amd_ci: + name: AMD mi250 + if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller'))) + uses: ./.github/workflows/self-push-amd.yml + with: + gpu_flavor: mi250 + secrets: inherit diff --git a/.github/workflows/self-push-amd-mi300-caller.yml b/.github/workflows/self-push-amd-mi300-caller.yml index a8ee4e540ecf3f..797916125a24fb 100644 --- a/.github/workflows/self-push-amd-mi300-caller.yml +++ b/.github/workflows/self-push-amd-mi300-caller.yml @@ -1,10 +1,10 @@ name: Self-hosted runner (AMD mi300 CI caller) on: - workflow_run: - workflows: ["Self-hosted runner (push-caller)"] - branches: ["main"] - types: [completed] + #workflow_run: + # workflows: ["Self-hosted runner (push-caller)"] + # branches: ["main"] + # types: [completed] push: branches: - run_amd_push_ci_caller* diff --git a/README.md b/README.md index c748e675066202..42403f84b885da 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta ### With pip -This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+. +This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 2.0+, and TensorFlow 2.6+. You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 00000000000000..a827da444f0801 --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,49 @@ +# Benchmarks + +You might want to add new benchmarks. + +You will need to define a python function named `run_benchmark` in your python file and the file must be located in this `benchmark/` directory. 
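As a quick illustration of how such a module plugs into `benchmarks_entrypoint.py` (which scans this directory and calls each module's `run_benchmark`), here is a minimal, hypothetical sketch; the file name `my_benchmark.py`, the placeholder metadata values, and the single timing recorded are illustrative only and not part of this PR:

```py
# benchmark/my_benchmark.py -- hypothetical example, not a file in this PR.
from logging import Logger
from time import perf_counter

import psycopg2

from benchmarks_entrypoint import MetricsRecorder


def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
    # MetricsRecorder handles all writes to the metrics database (see the section below).
    recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)
    benchmark_id = recorder.initialise_benchmark({"gpu_name": "placeholder-gpu", "model_id": "placeholder-model"})
    start = perf_counter()
    # ... run the workload being measured here ...
    recorder.collect_model_measurements(benchmark_id, {"model_load_time": perf_counter() - start})
    recorder.close()
```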
+ +The expected function signature is the following: + +```py +def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): +``` + +## Writing metrics to the database + +`MetricRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements. + +cf [`llama.py`](./llama.py) to see an example of this in practice. + +```py +from benchmarks_entrypoint import MetricsRecorder +import psycopg2 + +def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): + metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg) + benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id}) + # To collect device measurements + metrics_recorder.collect_device_measurements( + benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes + ) + # To collect your model measurements + metrics_recorder.collect_model_measurements( + benchmark_id, + { + "model_load_time": model_load_time, + "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time, + "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time, + "first_eager_generate_time_secs": first_eager_generate_time, + "second_eager_generate_time_secs": second_eager_generate_time, + "time_to_first_token_secs": time_to_first_token, + "time_to_second_token_secs": time_to_second_token, + "time_to_third_token_secs": time_to_third_token, + "time_to_next_token_mean_secs": mean_time_to_next_token, + "first_compile_generate_time_secs": first_compile_generate_time, + "second_compile_generate_time_secs": second_compile_generate_time, + "third_compile_generate_time_secs": third_compile_generate_time, + "fourth_compile_generate_time_secs": fourth_compile_generate_time, + }, + ) +``` diff --git a/benchmark/benchmarks_entrypoint.py b/benchmark/benchmarks_entrypoint.py new file mode 100644 index 00000000000000..7925e2902834f7 --- /dev/null +++ b/benchmark/benchmarks_entrypoint.py @@ -0,0 +1,144 @@ +import argparse +import importlib.util +import logging +import os +from typing import Dict +import psycopg2 +import sys + +from psycopg2.extras import Json +from psycopg2.extensions import register_adapter + + +register_adapter(dict, Json) + + +class ImportModuleException(Exception): + pass + + +class MetricsRecorder: + def __init__(self, connection, logger: logging.Logger, branch: str, commit_id: str, commit_msg: str): + self.conn = connection + self.conn.autocommit = True + self.logger = logger + self.branch = branch + self.commit_id = commit_id + self.commit_msg = commit_msg + + def initialise_benchmark(self, metadata: Dict[str, str]) -> int: + """ + Creates a new benchmark, returns the benchmark id + """ + # gpu_name: str, model_id: str + with self.conn.cursor() as cur: + cur.execute( + "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s) RETURNING benchmark_id", + (self.branch, self.commit_id, self.commit_msg, metadata), + ) + benchmark_id = cur.fetchone()[0] + logger.debug(f"initialised benchmark #{benchmark_id}") + return benchmark_id + + def collect_device_measurements(self, benchmark_id: int, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes): + """ + Collect device metrics, such as CPU & GPU usage. 
These are "static", as in you cannot pass arbitrary arguments to the function. + """ + with self.conn.cursor() as cur: + cur.execute( + "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)", + (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes), + ) + self.logger.debug( + f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]" + ) + + def collect_model_measurements(self, benchmark_id: int, measurements: Dict[str, float]): + with self.conn.cursor() as cur: + cur.execute( + """ + INSERT INTO model_measurements ( + benchmark_id, + measurements + ) VALUES (%s, %s) + """, + ( + benchmark_id, + measurements, + ), + ) + self.logger.debug(f"inserted model measurements for benchmark #{benchmark_id}: {measurements}") + + def close(self): + self.conn.close() + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.INFO) +formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def parse_arguments(): + """ + Parse command line arguments for the benchmarking CLI. + """ + parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.") + + parser.add_argument( + "branch", + type=str, + help="The branch name on which the benchmarking is performed.", + ) + + parser.add_argument( + "commit_id", + type=str, + help="The commit hash on which the benchmarking is performed.", + ) + + parser.add_argument( + "commit_msg", + type=str, + help="The commit message associated with the commit, truncated to 70 characters.", + ) + + args = parser.parse_args() + + return args.branch, args.commit_id, args.commit_msg + + +def import_from_path(module_name, file_path): + try: + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + except Exception as e: + raise ImportModuleException(f"failed to load python module: {e}") + + +if __name__ == "__main__": + benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__)) + + branch, commit_id, commit_msg = parse_arguments() + + for entry in os.scandir(benchmarks_folder_path): + try: + if not entry.name.endswith(".py"): + continue + if entry.path == __file__: + continue + logger.debug(f"loading: {entry.name}") + module = import_from_path(entry.name.split(".")[0], entry.path) + logger.info(f"runnning benchmarks in: {entry.name}") + module.run_benchmark(logger, branch, commit_id, commit_msg) + except ImportModuleException as e: + logger.error(e) + except Exception as e: + logger.error(f"error running benchmarks for {entry.name}: {e}") diff --git a/benchmark/default.yml b/benchmark/default.yml new file mode 100644 index 00000000000000..f3f02cab34d1bd --- /dev/null +++ b/benchmark/default.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +providers: + - name: 'Transformers Benchmarks' + orgId: 1 + type: file + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/dashboards diff --git a/benchmark/grafana_dashboard.json b/benchmark/grafana_dashboard.json index 3d579f7b368711..caaec78a522303 100644 --- a/benchmark/grafana_dashboard.json +++ b/benchmark/grafana_dashboard.json @@ -30,7 +30,7 @@ "title": "Go to 
data", "tooltip": "Go to data", "type": "link", - "url": "http://transformers-benchmarks.huggingface.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}" + "url": "http://transformers-benchmarks.hf.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}" } ], "liveNow": true, @@ -77,7 +77,7 @@ "properties": [ { "id": "custom.width", - "value": 196 + "value": 202 } ] }, @@ -101,7 +101,7 @@ "properties": [ { "id": "custom.width", - "value": 581 + "value": 524 } ] }, @@ -113,7 +113,19 @@ "properties": [ { "id": "custom.width", - "value": 379 + "value": 353 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "model_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 216 } ] } @@ -143,12 +155,14 @@ "targets": [ { "datasource": { - "type": "grafana-postgresql-datasource" + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name, created_at AS date FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT commit_id, commit_message, metadata->>'gpu_name' as gpu_name, metadata->>'model_id' as model_id, created_at AS date FROM benchmarks WHERE branch = '${branch}' AND metadata->>'gpu_name' = '${gpu_name}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -306,13 +320,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -431,13 +446,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -565,13 +581,14 @@ 
"targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -686,13 +703,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -807,13 +825,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -928,13 +947,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double 
precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1062,13 +1082,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1183,13 +1204,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1304,13 +1326,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", "rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1425,13 +1448,14 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", 
"rawQuery": true, - "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", "refId": "A", "sql": { "columns": [ @@ -1480,11 +1504,7 @@ "id": 15, "panels": [ { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -1528,8 +1548,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1563,8 +1582,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -1665,11 +1685,7 @@ "type": "timeseries" }, { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -1713,8 +1729,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1748,8 +1763,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -1850,11 +1866,7 @@ "type": "timeseries" }, { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -1898,8 +1910,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1933,8 +1944,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -2035,11 +2047,7 @@ "type": "timeseries" }, { - "datasource": { - "default": true, - "type": "grafana-postgresql-datasource", - "uid": "be28nkzirtb0gd" - }, + "datasource": {}, "fieldConfig": { "defaults": { "color": { @@ -2083,8 +2091,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -2118,8 +2125,9 @@ "targets": [ { "datasource": { + "default": true, "type": "grafana-postgresql-datasource", - "uid": "bdz2yss7sxo1sc" + "uid": "be28nkzirtb0gd" }, "editorMode": "code", "format": "table", @@ -2224,7 +2232,6 @@ "type": "row" } ], - "refresh": "", "schemaVersion": 39, "tags": [], "templating": { @@ -2236,6 +2243,7 @@ "value": "main" }, "datasource": { + "default": true, "type": "grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, @@ -2248,7 +2256,7 @@ "name": "branch", "options": [], "query": "SELECT DISTINCT branch FROM benchmarks;", - "refresh": 2, + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2261,6 +2269,7 @@ "value": "1729701492845" }, "datasource": { + 
"default": true, "type": "grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, @@ -2281,10 +2290,11 @@ { "current": { "selected": false, - "text": "1730120430069", - "value": "1730120430069" + "text": "1730393397577", + "value": "1730393397577" }, "datasource": { + "default": true, "type": "grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, @@ -2312,15 +2322,16 @@ "type": "grafana-postgresql-datasource", "uid": "be28nkzirtb0gd" }, - "definition": "SELECT DISTINCT gpu_name FROM benchmarks;", + "definition": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;", + "description": "", "hide": 0, "includeAll": false, "label": "GPU", "multi": false, "name": "gpu_name", "options": [], - "query": "SELECT DISTINCT gpu_name FROM benchmarks;", - "refresh": 2, + "query": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;", + "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, @@ -2328,7 +2339,7 @@ }, { "current": { - "selected": false, + "selected": true, "text": "10", "value": "10" }, @@ -2359,6 +2370,6 @@ "timezone": "browser", "title": "Transformers benchmarks", "uid": "fdz33iyzln9c0a", - "version": 4, + "version": 10, "weekStart": "" } diff --git a/benchmark/grafana_datasource.yaml b/benchmark/grafana_datasource.yaml new file mode 100644 index 00000000000000..25f36254104ab5 --- /dev/null +++ b/benchmark/grafana_datasource.yaml @@ -0,0 +1,17 @@ +apiVersion: 1 +datasources: + - name: grafana-postgresql-datasource + uid: be28nkzirtb0gd + type: postgres + url: $GRAFANA_POSTGRES_DATASOURCE_URL + user: $GRAFANA_POSTGRES_DATASOURCE_USER + secureJsonData: + password: $GRAFANA_POSTGRES_DATASOURCE_PWD + jsonData: + database: metrics + maxOpenConns: 100 + maxIdleConns: 100 + maxIdleConnsAuto: true + connMaxLifetime: 14400 + postgresVersion: 1000 + timescaledb: false diff --git a/benchmark/init_db.sql b/benchmark/init_db.sql index 573cc11518e857..a7864c4af183b6 100644 --- a/benchmark/init_db.sql +++ b/benchmark/init_db.sql @@ -3,7 +3,7 @@ CREATE TABLE IF NOT EXISTS benchmarks ( branch VARCHAR(255), commit_id VARCHAR(72), commit_message VARCHAR(70), - gpu_name VARCHAR(255), + metadata jsonb, created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') ); diff --git a/benchmark/llama.py b/benchmark/llama.py index 4a2c57422e6ffb..bbe1afefd5ef1b 100644 --- a/benchmark/llama.py +++ b/benchmark/llama.py @@ -1,71 +1,25 @@ -import argparse -import json -import logging +from logging import Logger import os -import sys -from statistics import mean from threading import Event, Thread from time import perf_counter, sleep from typing import Optional +from benchmarks_entrypoint import MetricsRecorder import gpustat import psutil import psycopg2 import torch from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache -from psycopg2.extras import Json -from psycopg2.extensions import register_adapter os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -handler = logging.StreamHandler(sys.stdout) -handler.setLevel(logging.INFO) -formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) - os.environ["TOKENIZERS_PARALLELISM"] = "1" torch.set_float32_matmul_precision("high") -register_adapter(dict, Json) - - -def parse_arguments(): - """ - Parse command line arguments for the benchmarking CLI. 
- """ - parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.") - - parser.add_argument( - "branch", - type=str, - help="The branch name on which the benchmarking is performed.", - ) - - parser.add_argument( - "commit_id", - type=str, - help="The commit hash on which the benchmarking is performed.", - ) - parser.add_argument( - "commit_msg", - type=str, - help="The commit message associated with the commit, truncated to 70 characters.", - ) - args = parser.parse_args() - - return args.branch, args.commit_id, args.commit_msg - - -def collect_metrics(benchmark_id, continue_metric_collection): +def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder): p = psutil.Process(os.getpid()) - conn = psycopg2.connect("dbname=metrics") - cur = conn.cursor() while not continue_metric_collection.is_set(): with p.oneshot(): cpu_util = p.cpu_percent() @@ -73,47 +27,41 @@ def collect_metrics(benchmark_id, continue_metric_collection): gpu_stats = gpustat.GPUStatCollection.new_query() gpu_util = gpu_stats[0]["utilization.gpu"] gpu_mem_megabytes = gpu_stats[0]["memory.used"] - cur.execute( - "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)", - (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes), + metrics_recorder.collect_device_measurements( + benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes ) sleep(0.01) - conn.commit() - conn.close() -def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): +def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): continue_metric_collection = Event() metrics_thread = None + model_id = "meta-llama/Llama-2-7b-hf" + metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg) try: gpu_stats = gpustat.GPUStatCollection.new_query() gpu_name = gpu_stats[0]["name"] - conn = psycopg2.connect("dbname=metrics") - cur = conn.cursor() - cur.execute( - "INSERT INTO benchmarks (branch, commit_id, commit_message, gpu_name) VALUES (%s, %s, %s, %s) RETURNING benchmark_id", - (branch, commit_id, commit_msg, gpu_name), + benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id}) + logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}") + metrics_thread = Thread( + target=collect_metrics, + args=[benchmark_id, continue_metric_collection, metrics_recorder], ) - conn.commit() - benchmark_id = cur.fetchone()[0] - logger.info(f"running benchmark #{benchmark_id} on {gpu_name}") - metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection]) metrics_thread.start() logger.info("started background thread to fetch device metrics") os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling device = "cuda" - ckpt = "meta-llama/Llama-2-7b-hf" logger.info("downloading weights") # This is to avoid counting download in model load time measurement - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16) + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16) gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1) logger.info("loading model") start = perf_counter() model = AutoModelForCausalLM.from_pretrained( - ckpt, torch_dtype=torch.float16, generation_config=gen_config + model_id, torch_dtype=torch.float16, 
generation_config=gen_config ).eval() model.to(device) torch.cuda.synchronize() @@ -121,7 +69,7 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge model_load_time = end - start logger.info(f"loaded model in: {model_load_time}s") - tokenizer = AutoTokenizer.from_pretrained(ckpt) + tokenizer = AutoTokenizer.from_pretrained(model_id) prompt = "Why dogs are so cute?" inputs = tokenizer(prompt, return_tensors="pt").to(device) @@ -368,41 +316,27 @@ def decode_one_token(model, cur_token, cache_position, past_key_values): logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s") logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") - cur.execute( - """ - INSERT INTO model_measurements ( - benchmark_id, - measurements - ) VALUES (%s, %s) - """, - ( - benchmark_id, - { - "model_load_time": model_load_time, - "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time, - "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time, - "first_eager_generate_time_secs": first_eager_generate_time, - "second_eager_generate_time_secs": second_eager_generate_time, - "time_to_first_token_secs": time_to_first_token, - "time_to_second_token_secs": time_to_second_token, - "time_to_third_token_secs": time_to_third_token, - "time_to_next_token_mean_secs": mean_time_to_next_token, - "first_compile_generate_time_secs": first_compile_generate_time, - "second_compile_generate_time_secs": second_compile_generate_time, - "third_compile_generate_time_secs": third_compile_generate_time, - "fourth_compile_generate_time_secs": fourth_compile_generate_time, - }, - ), + metrics_recorder.collect_model_measurements( + benchmark_id, + { + "model_load_time": model_load_time, + "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time, + "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time, + "first_eager_generate_time_secs": first_eager_generate_time, + "second_eager_generate_time_secs": second_eager_generate_time, + "time_to_first_token_secs": time_to_first_token, + "time_to_second_token_secs": time_to_second_token, + "time_to_third_token_secs": time_to_third_token, + "time_to_next_token_mean_secs": mean_time_to_next_token, + "first_compile_generate_time_secs": first_compile_generate_time, + "second_compile_generate_time_secs": second_compile_generate_time, + "third_compile_generate_time_secs": third_compile_generate_time, + "fourth_compile_generate_time_secs": fourth_compile_generate_time, + }, ) - conn.commit() - conn.close() except Exception as e: logger.error(f"Caught exception: {e}") continue_metric_collection.set() if metrics_thread is not None: metrics_thread.join() - - -if __name__ == "__main__": - branch, commit_id, commit_msg = parse_arguments() - run_benchmark(branch, commit_id, commit_msg, num_tokens_to_generate=20) + metrics_recorder.close() diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile index da91906d621429..83f8565c8f467e 100644 --- a/docker/transformers-pytorch-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-amd-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM rocm/dev-ubuntu-22.04:6.0.2 +FROM rocm/dev-ubuntu-22.04:6.1 # rocm/pytorch has no version with 2.1.0 LABEL maintainer="Hugging Face" @@ -11,7 +11,7 @@ RUN apt update && \ RUN python3 -m pip install --no-cache-dir --upgrade pip numpy -RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0 +RUN python3 -m pip install torch torchvision 
torchaudio --index-url https://download.pytorch.org/whl/rocm6.1 RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0" @@ -30,5 +30,5 @@ RUN python3 -m pip uninstall -y tensorflow flax # this line must be added in order for python to be aware of transformers. RUN cd transformers && python3 setup.py develop -# Remove nvml as it is not compatible with ROCm. apex is not tested on NVIDIA either. -RUN python3 -m pip uninstall py3nvml pynvml apex -y +# Remove nvml and nvidia-ml-py as it is not compatible with ROCm. apex is not tested on NVIDIA either. +RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 53e66662f9ee99..3cb2acdc53bb1a 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -9,7 +9,7 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.4.1' +ARG PYTORCH='2.5.1' # Example: `cu102`, `cu113`, etc. ARG CUDA='cu118' @@ -36,15 +36,23 @@ RUN python3 -m pip install --no-cache-dir einops # Add bitsandbytes for mixed int8 testing RUN python3 -m pip install --no-cache-dir bitsandbytes -# Add auto-gptq for gtpq quantization testing -RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ +# Add auto-gptq for gtpq quantization testing, installed from source for pytorch==2.5.1 compatibility +# TORCH_CUDA_ARCH_LIST="7.5+PTX" is added to make the package compile for Tesla T4 gpus available for the CI. 
+RUN pip install gekko +RUN git clone https://github.com/PanQiWei/AutoGPTQ.git && cd AutoGPTQ && TORCH_CUDA_ARCH_LIST="7.5+PTX" python3 setup.py install # Add optimum for gptq quantization testing RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum +# Add PEFT +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft + # Add aqlm for quantization testing RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 +# Add vptq for quantization testing +RUN python3 -m pip install --no-cache-dir vptq + # Add hqq for quantization testing RUN python3 -m pip install --no-cache-dir hqq @@ -52,8 +60,8 @@ RUN python3 -m pip install --no-cache-dir hqq RUN python3 -m pip install --no-cache-dir gguf # Add autoawq for quantization testing -# >=v0.2.3 needed for compatibility with torch 2.2.1 -RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp310-cp310-linux_x86_64.whl +# >=v0.2.7 needed for compatibility with transformers > 4.46 +RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.7.post2/autoawq-0.2.7.post2-py3-none-any.whl # Add quanto for quantization testing RUN python3 -m pip install --no-cache-dir optimum-quanto diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index e66ed3381e2c96..287f4dffbb384e 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -119,26 +119,32 @@ title: مشاركة نموذج مخصص - local: chat_templating title: قوالب لنماذج الدردشة -# - local: trainer -# title: المدرب -# - local: sagemaker -# title: تشغيل التدريب على Amazon SageMaker -# - local: serialization -# title: التصدير إلى ONNX + - local: trainer + title: المدرب + - local: sagemaker + title: تشغيل التدريب على Amazon SageMaker + - local: serialization + title: التصدير إلى ONNX - local: tflite title: التصدير إلى TFLite -# - local: torchscript -# title: التصدير إلى TorchScript -# - local: benchmarks -# title: المعايير -# - local: notebooks -# title: دفاتر الملاحظات مع الأمثلة -# - local: community -# title: موارد المجتمع -# - local: troubleshooting -# title: استكشاف الأخطاء وإصلاحها + - local: torchscript + title: التصدير إلى TorchScript + - local: benchmarks + title: المعايير + - local: notebooks + title: دفاتر الملاحظات مع الأمثلة + - local: community + title: موارد المجتمع + - local: troubleshooting + title: استكشاف الأخطاء وإصلاحها - local: gguf title: التوافق مع ملفات GGUF + - local: tiktoken + title: التوافق مع ملفات TikToken + - local: modular_transformers + title: الوحدات النمطية في `transformers` + - local: how_to_hack_models + title: اختراق النموذج (الكتابة فوق فئة لاستخدامك) title: أدلة المطورين # - sections: # - local: quantization/overview @@ -151,6 +157,8 @@ # title: AWQ # - local: quantization/aqlm # title: AQLM +# - local: quantization/vptq +# title: VPTQ # - local: quantization/quanto # title: Quanto # - local: quantization/eetq diff --git a/docs/source/ar/agents.md b/docs/source/ar/agents.md index 92b2a4715f6f07..1213b35008605b 100644 --- a/docs/source/ar/agents.md +++ b/docs/source/ar/agents.md @@ -464,7 +464,7 @@ image = image_generator(prompt=improved_prompt) قبل إنشاء الصورة أخيرًا: - + > [!WARNING] > تتطلب gradio-tools إدخالات وإخراجات *نصية* حتى عند العمل مع طرائق مختلفة مثل كائنات الصور والصوت. الإدخالات والإخراجات الصورية والصوتية غير متوافقة حاليًا. 
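Editor's note on the `benchmark/llama.py` refactor above: the script now delegates every database write to a `MetricsRecorder` imported from `benchmarks_entrypoint`, which is not shown in this part of the patch. The sketch below is a hypothetical, minimal reconstruction of the interface those calls rely on (`initialise_benchmark`, `collect_device_measurements`, `collect_model_measurements`, `close`), inferred only from the inline SQL it replaces and from the new `metadata jsonb` column in `init_db.sql`; the real `benchmarks_entrypoint.py` may differ.

```python
# Hypothetical sketch of the MetricsRecorder used by benchmark/llama.py.
# Inferred from the SQL it replaces; NOT the actual benchmarks_entrypoint.py.
from logging import Logger

from psycopg2.extensions import register_adapter
from psycopg2.extras import Json

register_adapter(dict, Json)  # store Python dicts in jsonb columns


class MetricsRecorder:
    def __init__(self, connection, logger: Logger, branch: str, commit_id: str, commit_msg: str):
        self.conn = connection
        self.logger = logger
        self.branch = branch
        self.commit_id = commit_id
        self.commit_msg = commit_msg

    def initialise_benchmark(self, metadata: dict) -> int:
        """Insert a benchmark row (metadata stored as jsonb) and return its id."""
        with self.conn.cursor() as cur:
            cur.execute(
                "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) "
                "VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
                (self.branch, self.commit_id, self.commit_msg, metadata),
            )
            benchmark_id = cur.fetchone()[0]
        self.conn.commit()
        return benchmark_id

    def collect_device_measurements(self, benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes):
        """Record one sample of CPU/GPU utilisation for a running benchmark."""
        with self.conn.cursor() as cur:
            cur.execute(
                "INSERT INTO device_measurements "
                "(benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) "
                "VALUES (%s, %s, %s, %s, %s)",
                (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
            )
        self.conn.commit()

    def collect_model_measurements(self, benchmark_id, measurements: dict):
        """Record the per-run timing dict (load time, eager/compile latencies, ...)."""
        with self.conn.cursor() as cur:
            cur.execute(
                "INSERT INTO model_measurements (benchmark_id, measurements) VALUES (%s, %s)",
                (benchmark_id, measurements),
            )
        self.conn.commit()

    def close(self):
        self.conn.close()
```

With a helper of this shape, `run_benchmark` only needs `psycopg2.connect("dbname=metrics")` once and no longer manages cursors or commits itself, which is what the diff above removes from `llama.py`.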
diff --git a/docs/source/ar/benchmarks.md b/docs/source/ar/benchmarks.md new file mode 100644 index 00000000000000..71e1829e643350 --- /dev/null +++ b/docs/source/ar/benchmarks.md @@ -0,0 +1,352 @@ +# معايير الأداء + + +أدوات قياس الأداء من Hugging Face أصبحت قديمة،ويُنصح باستخدام مكتبات خارجية لقياس سرعة وتعقيد الذاكرة لنماذج Transformer. + + + +[[open-in-colab]] + +لنلق نظرة على كيفية تقييم أداء نماذج 🤗 Transformers، وأفضل الممارسات، ومعايير الأداء المتاحة بالفعل. + +يُمكن العثور على دفتر ملاحظات يشرح بالتفصيل كيفية قياس أداء نماذج 🤗 Transformers [هنا](https://github.com/huggingface/notebooks/tree/main/examples/benchmark.ipynb). + +## كيفية قياس أداء نماذج 🤗 Transformers + +تسمح الفئتان [`PyTorchBenchmark`] و [`TensorFlowBenchmark`] بتقييم أداء نماذج 🤗 Transformers بمرونة. تتيح لنا فئات التقييم قياس الأداء قياس _الاستخدام الأقصى للذاكرة_ و _الوقت اللازم_ لكل من _الاستدلال_ و _التدريب_. + + + +هنا، ييُعرَّف _الاستدلال_ بأنه تمريرة أمامية واحدة، ويتم تعريف _التدريب_ بأنه تمريرة أمامية واحدة وتمريرة خلفية واحدة. + + + +تتوقع فئات تقييم الأداء [`PyTorchBenchmark`] و [`TensorFlowBenchmark`] كائنًا من النوع [`PyTorchBenchmarkArguments`] و [`TensorFlowBenchmarkArguments`]، على التوالي، للتنفيذ. [`PyTorchBenchmarkArguments`] و [`TensorFlowBenchmarkArguments`] هي فئات بيانات وتحتوي على جميع التكوينات ذات الصلة لفئة تقييم الأداء المقابلة. في المثال التالي، يتم توضيح كيفية تقييم أداء نموذج BERT من النوع _bert-base-cased_. + + + + +```py +>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments + +>>> args = PyTorchBenchmarkArguments(models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) +>>> benchmark = PyTorchBenchmark(args) +``` + + + +```py +>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments + +>>> args = TensorFlowBenchmarkArguments( +... models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> benchmark = TensorFlowBenchmark(args) +``` + + + +هنا، يتم تمرير ثلاثة معامﻻت إلى فئات بيانات حجة قياس الأداء، وهي `models` و `batch_sizes` و `sequence_lengths`. المعامل `models` مطلوبة وتتوقع `قائمة` من بمعرّفات النموذج من [مركز النماذج](https://huggingface.co/models) تحدد معامﻻت القائمة `batch_sizes` و `sequence_lengths` حجم `input_ids` الذي يتم قياس أداء النموذج عليه. هناك العديد من المعلمات الأخرى التي يمكن تكوينها عبر فئات بيانات معال قياس الأداء. لمزيد من التفاصيل حول هذه المعلمات، يمكنك إما الرجوع مباشرة إلى الملفات `src/transformers/benchmark/benchmark_args_utils.py`، `src/transformers/benchmark/benchmark_args.py` (لـ PyTorch) و `src/transformers/benchmark/benchmark_args_tf.py` (لـ Tensorflow). أو، بدلاً من ذلك، قم بتشغيل أوامر shell التالية من المجلد الرئيسي لطباعة قائمة وصفية بجميع المعلمات القابلة للتكوين لـ PyTorch و Tensorflow على التوالي. + + + + +```bash +python examples/pytorch/benchmarking/run_benchmark.py --help +``` + +يُمكن ببساطة تشغيل كائن التقييم الذي تم تهيئته عن طريق استدعاء `benchmark.run()`. 
+ +```py +>>> results = benchmark.run() +>>> print(results) +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 0.006 +google-bert/bert-base-uncased 8 32 0.006 +google-bert/bert-base-uncased 8 128 0.018 +google-bert/bert-base-uncased 8 512 0.088 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 1227 +google-bert/bert-base-uncased 8 32 1281 +google-bert/bert-base-uncased 8 128 1307 +google-bert/bert-base-uncased 8 512 1539 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: PyTorch +- use_torchscript: False +- framework_version: 1.4.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 08:58:43.371351 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + +```bash +python examples/tensorflow/benchmarking/run_benchmark_tf.py --help +``` + +يُمكن بعد ذلك تشغيل كائن قياس الأداء الذي تم تهيئته عن طريق استدعاء `benchmark.run()`. 
+ +```py +>>> results = benchmark.run() +>>> print(results) +>>> results = benchmark.run() +>>> print(results) +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 0.005 +google-bert/bert-base-uncased 8 32 0.008 +google-bert/bert-base-uncased 8 128 0.022 +google-bert/bert-base-uncased 8 512 0.105 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 1330 +google-bert/bert-base-uncased 8 32 1330 +google-bert/bert-base-uncased 8 128 1330 +google-bert/bert-base-uncased 8 512 1770 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 202.11.0 +- framework: Tensorflow +- use_xla: False +- framework_version: 2.2.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:26:35.617317 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + +بشكل افتراضي، يتم تقييم _الوقت_ و _الذاكرة المطلوبة_ لـ _الاستدلال_. في مثال المخرجات أعلاه، يُظهر القسمان الأولان النتيجة المقابلة لـ _وقت الاستدلال_ و _ذاكرة الاستدلال_. بالإضافة إلى ذلك، يتم طباعة جميع المعلومات ذات الصلة حول بيئة الحوسبة، على سبيل المثال نوع وحدة معالجة الرسومات (GPU)، والنظام، وإصدارات المكتبة، وما إلى ذلك، في القسم الثالث تحت _معلومات البيئة_. يمكن حفظ هذه المعلومات بشكل اختياري في ملف _.csv_ عند إضافة المعامل `save_to_csv=True` إلى [`PyTorchBenchmarkArguments`] و [`TensorFlowBenchmarkArguments`] على التوالي. في هذه الحالة، يتم حفظ كل قسم في ملف _.csv_ منفصل. يمكن اختيارًا تحديد مسار كل ملف _.csv_ عبر فئات بيانات معامل قياس الأداء. + +بدلاً من تقييم النماذج المدربة مسبقًا عبر معرّف النموذج، على سبيل المثال `google-bert/bert-base-uncased`، يُمكن للمستخدم بدلاً من ذلك قياس أداء تكوين عشوائي لأي فئة نموذج متاحة. في هذه الحالة، يجب إدراج "قائمة" من التكوينات مع معامل قياس الأداء كما هو موضح أدناه. + + + + +```py +>>> from transformers import PyTorchBenchmark، PyTorchBenchmarkArguments، BertConfig + +>>> args = PyTorchBenchmarkArguments( +... models=["bert-base"، "bert-384-hid"، "bert-6-lay"]، batch_sizes=[8]، sequence_lengths=[8، 32، 128، 512] +... 
) +>>> config_base = BertConfig() +>>> config_384_hid = BertConfig(hidden_size=384) +>>> config_6_lay = BertConfig(num_hidden_layers=6) + +>>> benchmark = PyTorchBenchmark(args، configs=[config_base، config_384_hid، config_6_lay]) +>>> benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base 8 128 0.006 +bert-base 8 512 0.006 +bert-base 8 128 0.018 +bert-base 8 512 0.088 +bert-384-hid 8 8 0.006 +bert-384-hid 8 32 0.006 +bert-384-hid 8 128 0.011 +bert-384-hid 8 512 0.054 +bert-6-lay 8 8 0.003 +bert-6-lay 8 32 0.004 +bert-6-lay 8 128 0.009 +bert-6-lay 8 512 0.044 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +## نتائج اختبار الأداء + +في هذا القسم، يتم قياس _وقت الاستدلال_ و _الذاكرة المطلوبة_ للاستدلال، لمختلف تكوينات `BertModel`. يتم عرض النتائج في جدول، مع تنسيق مختلف قليلاً لكل من PyTorch و TensorFlow. + +-------------------------------------------------------------------------------- +| اسم النموذج | حجم الدفعة | طول التسلسل | الذاكرة بالميغابايت | +-------------------------------------------------------------------------------- +| bert-base | 8 | 8 | 1277 | +| bert-base | 8 | 32 | 1281 | +| bert-base | 8 | 128 | 1307 | +| bert-base | 8 | 512 | 1539 | +| bert-384-hid | 8 | 8 | 1005 | +| bert-384-hid | 8 | 32 | 1027 | +| bert-384-hid | 8 | 128 | 1035 | +| bert-384-hid | 8 | 512 | 1255 | +| bert-6-lay | 8 | 8 | 1097 | +| bert-6-lay | 8 | 32 | 1101 | +| bert-6-lay | 8 | 128 | 1127 | +| bert-6-lay | 8 | 512 | 1359 | +-------------------------------------------------------------------------------- + +==================== معلومات البيئة ==================== + +- transformers_version: 2.11.0 +- framework: PyTorch +- use_torchscript: False +- framework_version: 1.4.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:35:25.143267 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + +```py +>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig + +>>> args = TensorFlowBenchmarkArguments( +... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... 
) +>>> config_base = BertConfig() +>>> config_384_hid = BertConfig(hidden_size=384) +>>> config_6_lay = BertConfig(num_hidden_layers=6) + +>>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +>>> benchmark.run() +==================== نتائج السرعة في الاستدلال ==================== +-------------------------------------------------------------------------------- +| اسم النموذج | حجم الدفعة | طول التسلسل | الوقت بالثانية | +-------------------------------------------------------------------------------- +| bert-base | 8 | 8 | 0.005 | +| bert-base | 8 | 32 | 0.008 | +| bert-base | 8 | 128 | 0.022 | +| bert-base | 8 | 512 | 0.106 | +| bert-384-hid | 8 | 8 | 0.005 | +| bert-384-hid | 8 | 32 | 0.007 | +| bert-384-hid | 8 | 128 | 0.018 | +| bert-384-hid | 8 | 512 | 0.064 | +| bert-6-lay | 8 | 8 | 0.002 | +| bert-6-lay | 8 | 32 | 0.003 | +| bert-6-lay | 8 | 128 | 0.0011 | +| bert-6-lay | 8 | 512 | 0.074 | +-------------------------------------------------------------------------------- + +==================== نتائج الذاكرة في الاستدلال ==================== +-------------------------------------------------------------------------------- +| اسم النموذج | حجم الدفعة | طول التسلسل | الذاكرة بالميغابايت | +-------------------------------------------------------------------------------- +| اسم النموذج | حجم الدفعة | طول التسلسل | الذاكرة بالميغابايت | +-------------------------------------------------------------------------------- +| bert-base | 8 | 8 | 1330 | +| bert-base | 8 | 32 | 1330 | +| bert-base | 8 | 128 | 1330 | +| bert-base | 8 | 512 | 1770 | +| bert-384-hid | 8 | 8 | 1330 | +| bert-384-hid | 8 | 32 | 1330 | +| bert-384-hid | 8 | 128 | 1330 | +| bert-384-hid | 8 | 512 | 1540 | +| bert-6-lay | 8 | 8 | 1330 | +| bert-6-lay | 8 | 32 | 1330 | +| bert-6-lay | 8 | 128 | 1330 | +| bert-6-lay | 8 | 512 | 1540 | +-------------------------------------------------------------------------------- + +==================== معلومات البيئة ==================== + +- transformers_version: 2.11.0 +- framework: Tensorflow +- use_xla: False +- framework_version: 2.2.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:38:15.487125 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + +مرة أخرى، يتم قياس _وقت الاستدلال_ و _الذاكرة المطلوبة_ للاستدلال، ولكن هذه المرة لتكوينات مخصصة لـ `BertModel`. يمكن أن تكون هذه الميزة مفيدة بشكل خاص عند اتخاذ قرار بشأن التكوين الذي يجب تدريب النموذج عليه. + +## أفضل الممارسات في اختبار الأداء + +يسرد هذا القسم بعض أفضل الممارسات التي يجب مراعاتها عند إجراء اختبار الأداء لنموذج ما. + +- حالياً، يتم دعم اختبار الأداء على جهاز واحد فقط. عند إجراء الاختبار على وحدة معالجة الرسوميات (GPU)، يوصى بأن يقوم المستخدم بتحديد الجهاز الذي يجب تشغيل التعليمات البرمجية عليه من خلال تعيين متغير البيئة `CUDA_VISIBLE_DEVICES` في الشل، على سبيل المثال `export CUDA_VISIBLE_DEVICES=0` قبل تشغيل التعليمات البرمجية. +- يجب تعيين الخيار `no_multi_processing` إلى `True` فقط لأغراض الاختبار والتصحيح. ولضمان قياس الذاكرة بدقة، يوصى بتشغيل كل اختبار ذاكرة في عملية منفصلة والتأكد من تعيين `no_multi_processing` إلى `True`. +- يجب دائمًا ذكر معلومات البيئة عند مشاركة نتائج تقييم النموذج. 
يُمكن أن تختلف النتائج اختلافًا كبيرًا بين أجهزة GPU المختلفة وإصدارات المكتبات، وما إلى ذلك، لذلك فإن نتائج الاختبار بمفردها ليست مفيدة جدًا للمجتمع. + +## مشاركة نتائج اختبار الأداء الخاص بك + +في السابق، تم إجراء اختبار الأداء لجميع النماذج الأساسية المتاحة (10 في ذلك الوقت) لقياس _وقت الاستدلال_، عبر العديد من الإعدادات المختلفة: باستخدام PyTorch، مع TorchScript وبدونها، باستخدام TensorFlow، مع XLA وبدونه. تم إجراء جميع هذه الاختبارات على وحدات المعالجة المركزية (CPU) (باستثناء XLA TensorFlow) ووحدات معالجة الرسوميات (GPU). + +يتم شرح هذا النهج بالتفصيل في [منشور المدونة هذا](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) وتتوفر النتائج [هنا](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing). + +مع أدوات اختبار الأداء الجديدة، أصبح من الأسهل من أي وقت مضى مشاركة نتائج اختبار الأداء الخاص بك مع المجتمع: + +- [نتائج اختبار الأداء في PyTorch](https://github.com/huggingface/transformers/tree/main/examples/pytorch/benchmarking/README.md). +- [نتائج اختبار الأداء في TensorFlow](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/benchmarking/README.md). diff --git a/docs/source/ar/community.md b/docs/source/ar/community.md new file mode 100644 index 00000000000000..5a1c31de0aaa3f --- /dev/null +++ b/docs/source/ar/community.md @@ -0,0 +1,66 @@ +# مجتمع المطورين + +هذه الصفحة تجمع الموارد حول 🤗 Transformers التي طورها المجتمع. + +## موارد المجتمع: + +| المصدر | الوصف | المؤلف | +|:----------|:-------------|------:| +| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | مجموعة من البطاقات التعليمية القائمة على [Transformers Docs Glossary](glossary) والتي تم وضعها في شكل يمكن تعلمه/مراجعته بسهولة باستخدام [Anki](https://apps.ankiweb.net/) وهو تطبيق مفتوح المصدر متعدد المنصات مصمم خصيصًا للاحتفاظ بالمعرفة على المدى الطويل. شاهد هذا [فيديو تمهيدي حول كيفية استخدام البطاقات التعليمية](https://www.youtube.com/watch?v=Dji_7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | + +## دفاتر ملاحظات المجتمع: + +| الدفتر | الوصف | المؤلف | | +|:----------|:-------------|:-------------|------:| +| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | كيفية توليد كلمات الأغاني على غرار فنانك المفضل من خلال ضبط نموذج GPT-2 | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Train T5 in Tensorflow 2](https://github.com/snapthat/TF-T5-text-to-text) | كيفية تدريب T5 لأي مهمة باستخدام Tensorflow 2. 
يوضح هذا الدفتر مهمة السؤال والجواب المنفذة في Tensorflow 2 باستخدام SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | كيفية تدريب T5 على SQUAD مع Transformers و Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | كيفية ضبط نموذج T5 للتصنيف والمهام متعددة الخيارات باستخدام تنسيق النص إلى نص مع PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | كيفية ضبط نموذج DialoGPT على مجموعة بيانات جديدة لروبوتات الدردشة المحادثية المفتوحة | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | كيفية التدريب على تسلسلات طويلة تصل إلى 500,000 رمز باستخدام Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | كيفية ضبط نموذج BART للتلخيص باستخدام fastai باستخدام blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | +| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | كيفية توليد تغريدات على غرار حساب Twitter المفضل لديك من خلال ضبط نموذج GPT-2 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | دليل كامل لعرض تكامل W&B مع Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | كيفية بناء نسخة "طويلة" من النماذج المسبقة التدريب الموجودة | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | كيفية ضبط نموذج Longformer لمهمة QA | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | كيفية تقييم نموذج Longformer على TriviaQA مع `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | كيفية ضبط نموذج T5 لاستخراج المشاعر باستخدام تنسيق النص إلى نص مع PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | كيفية ضبط نموذج DistilBert للتصنيف متعدد الفئات باستخدام PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|كيفية ضبط نموذج BERT للتصنيف متعدد التصنيفات باستخدام PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|كيفية ضبط نموذج T5 للتلخيص في PyTorch وتتبع التجارب باستخدام WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +|[Speed up Fine-Tuning in Transformers with Dynamic Padding / 
Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|كيفية تسريع الضبط الدقيق بعامل 2 باستخدام الضبط الديناميكي/التقسيم|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| كيفية تدريب نموذج Reformer مع طبقات الانتباه ثنائية الاتجاه | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| كيفية زيادة مفردات نموذج SciBERT المسبق التدريب من AllenAI على مجموعة بيانات CORD وإنشاء خط أنابيب لها. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| كيفية ضبط نموذج BlenderBotSmall للتلخيص على مجموعة بيانات مخصصة، باستخدام واجهة برمجة التطبيقات Trainer. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | كيفية ضبط نموذج Electra للتحليل العاطفي وتفسير التنبؤات باستخدام Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | كيفية ضبط نموذج GPT-2 غير الإنجليزي باستخدام فئة Trainer | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | كيفية ضبط نموذج DistilBERT لمهمة التصنيف متعدد التصنيفات | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | كيفية ضبط نموذج ALBERT أو أي نموذج آخر قائم على BERT لمهمة 
التصنيف المزدوج للجمل | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | كيفية ضبط نموذج Roberta للتحليل العاطفي | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | ما مدى دقة الإجابات على الأسئلة التي يولدها نموذجك التحويلي seq2seq؟ | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | كيفية ضبط نموذج DistilBERT للتصنيف النصي في TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | كيفية البدء السريع لنموذج *EncoderDecoderModel* مع نقطة تفتيش *google-bert/bert-base-uncased* للتلخيص على CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | كيفية البدء السريع لنموذج *EncoderDecoderModel* المشترك مع نقطة تفتيش *FacebookAI/roberta-base* للتلخيص على BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | كيفية ضبط نموذج *TapasForQuestionAnswering* مع نقطة تفتيش *tapas-base* على مجموعة بيانات Sequential Question Answering (SQA) | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | كيفية تقييم نموذج *TapasForSequenceClassification* المضبوط مسبقًا مع نقطة تفتيش *tapas-base-finetuned-tabfact* باستخدام مزيج من مكتبتي 🤗 datasets و 🤗 transformers | [Niels Rogge](https://github.com/nielsrogge) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | كيفية ضبط نموذج mBART باستخدام Seq2SeqTrainer للترجمة من الهندية إلى الإنجليزية | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Fine-tune LayoutLM on FUNSD (a form understanding dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | كيفية ضبط نموذج *LayoutLMForTokenClassification* على مجموعة بيانات FUNSD لاستخراج المعلومات من المستندات الممسوحة ضوئيًا | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | كيفية ضبط نموذج DistilGPT2 وتوليد النص | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | كيفية ضبط نموذج LED على pubmed للتلخيص طويل المدى | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | كيفية تقييم نموذج LED للتلخيص طويل المدى بشكل فعال | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | كيفية ضبط نموذج *LayoutLMForSequenceClassification* على مجموعة بيانات RVL-CDIP لتصنيف المستندات الممسوحة ضوئيًا | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | كيفية فك تشفير تسلسل CTC مع تعديل نموذج اللغة | [Eric Lam](https://github.com/voidful) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_zQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | كيفية ضبط نموذج BART للتلخيص بلغتين باستخدام فئة Trainer | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | كيفية تقييم نموذج BigBird للأسئلة والأجوبة على وثائق طويلة على Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | كيفية إنشاء تعليقات توضيحية على YouTube من أي فيديو من خلال تفريغ الصوت باستخدام Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | كيفية ضبط نموذج Vision Transformer (ViT) على CIFAR-10 باستخدام مكتبات HuggingFace Transformers و Datasets و PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | كيفية ضبط نموذج Vision Transformer (ViT) على CIFAR-10 باستخدام مكتبات HuggingFace Transformers و Datasets و 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | كيفية تقييم نموذج *LukeForEntityClassification* على مجموعة بيانات Open Entity | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | كيفية تقييم نموذج 
*LukeForEntityPairClassification* على مجموعة بيانات TACRED | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | كيفية تقييم نموذج *LukeForEntitySpanClassification* على مجموعة بيانات CoNLL-2003 | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | كيفية تقييم نموذج *BigBirdPegasusForConditionalGeneration* على مجموعة بيانات PubMed | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | كيفية استخدام نموذج Wav2Vec2 المسبق التدريب لتصنيف المشاعر على مجموعة بيانات MEGA | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | كيفية استخدام نموذج *DetrForObjectDetection* المدرب للكشف عن الأجسام في صورة وتصوير الانتباه | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | كيفية ضبط نموذج *DetrForObjectDetection* على مجموعة بيانات الكشف عن الأجسام المخصصة | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | كيفية ضبط نموذج *T5* على مهمة التعرف على الكيانات المسماة | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | +| [Fine-Tuning Open-Source LLM using QLoRA with MLflow and PEFT](https://github.com/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | كيفية استخدام [QLoRA](https://github.com/artidoro/qlora) و 
[PEFT](https://huggingface.co/docs/peft/en/index) لضبط نموذج LLM بطريقة فعالة من حيث الذاكرة، مع استخدام [MLflow](https://mlflow.org/docs/latest/llms/transformers/index.html) لإدارة تتبع التجارب | [Yuki Watanabe](https://github.com/B-Step62) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | diff --git a/docs/source/ar/how_to_hack_models.md b/docs/source/ar/how_to_hack_models.md new file mode 100644 index 00000000000000..8ce3589732f06a --- /dev/null +++ b/docs/source/ar/how_to_hack_models.md @@ -0,0 +1,163 @@ +# كيفية تعديل أي نموذج من نماذج Transformers + +توفر مكتبة [🤗 Transformers](https://github.com/huggingface/transformers) مجموعة من النماذج المسبقة التدريب والأدوات لمعالجة اللغات الطبيعية، والرؤية، وما إلى ذلك. على الرغم من أن هذه النماذج تغطي مجموعة واسعة من التطبيقات، فقد تواجه حالات استخدام لا تدعمها المكتبة بشكل افتراضي. يُمكن للتخصيص أن يفتح إمكانيات جديدة، مثل إضافة طبقات جديدة، أو تعديل البنية المعمارية، أو تحسين آليات الانتباه. سيُوضح لك هذا الدليل كيفية تعديل نماذج Transformers الموجودة لتلبية احتياجاتك المحددة. الشيء الرائع هو أنك لست بحاجة إلى الخروج من إطار عمل Transformers لإجراء هذه التغييرات. ي يمكنك تعديل النماذج مباشرةً في Transformers والاستفادة من الميزات مثل [واجهة برمجة التطبيقات Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer)، و [PreTrainedModel](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel)، والضبط الدقيق الفعال باستخدام أدوات مثل [PEFT](https://huggingface.co/docs/peft/index). + +سنرشدك في هذا الدليل لكيفية تخصيص نماذج Transformers الموجودة لتلبية متطلباتك، دون فقدان مزايا الإطار. ستتعلم كيفية: + +- تعديل بنية نموذج ما من خلال تغيير آلية الانتباه الخاصة به. +- تطبيق تقنيات مثل Low-Rank Adaptation (LoRA) على مكونات نموذج محددة. + +نحن نشجعك على المساهمة باختراقاتك الخاصة ومشاركتها هنا مع المجتمع1 + +## مثال: تعديل آلية الانتباه في نموذج Segment Anything (SAM) + +نموذج **Segment Anything (SAM)** هو نموذج رائد في مجال تجزئة الصور. في تنفيذه الافتراضي، يستخدم SAM إسقاطًا مجمعًا للاستعلام والمفتاح والقيمة (`qkv`) في آلية الانتباه الخاصة به. ومع ذلك، قد ترغب في ضبط مكونات محددة فقط من آلية الانتباه، مثل إسقاطات الاستعلام (`q`) والقيمة (`v`)، لتقليل عدد المعلمات القابلة للتدريب والموارد الحسابية المطلوبة. + +### الدافع + +من خلال تقسيم الإسقاط المجمع `qkv` إلى إسقاطات منفصلة `q` و `k` و `v`، يمكنك تطبيق تقنيات مثل **LoRA** (Low-Rank Adaptation) على إسقاطي `q` و `v` فقط. يسمح لك هذا بما يلي: + +- ضبط عدد أقل من المعلمات، مما يقلل من العبء الحسابي. +- تحقيق أداء أفضل من خلال التركيز على مكونات محددة. +- تجربة استراتيجيات تعديل مختلفة في آلية الانتباه. + +### التنفيذ + +#### **الخطوة 1: إنشاء فئة اهتمام مخصصة** + +بعد ذلك، قم بإنشاء فئة فرعية من فئة `SamVisionAttention` الأصلية وعدلها لتضم إسقاطات `q` و `k` و `v` منفصلة. 
+ +```python +import torch +import torch.nn as nn +from transformers.models.sam.modeling_sam import SamVisionAttention + +class SamVisionAttentionSplit(SamVisionAttention, nn.Module): + def __init__(self, config, window_size): + super().__init__(config, window_size) + del self.qkv + # إسقاطات منفصلة q و k و v + self.q = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self.k = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self.v = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self._register_load_state_dict_pre_hook(self.split_q_k_v_load_hook) + + def split_q_k_v_load_hook(self, state_dict, prefix, *args): + keys_to_delete = [] + for key in list(state_dict.keys()): + if "qkv." in key: + # تقسيم q و k و v من الإسقاط المجمع + q, k, v = state_dict[key].chunk(3, dim=0) + # استبدال الإسقاطات الفردية q و k و v + state_dict[key.replace("qkv.", "q.")] = q + state_dict[key.replace("qkv.", "k.")] = k + state_dict[key.replace("qkv.", "v.")] = v + # وضع علامة على مفتاح qkv القديم للحذف + keys_to_delete.append(key) + + # حذف مفاتيح qkv القديمة + for key in keys_to_delete: + del state_dict[key] + + def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor: + batch_size, height, width, _ = hidden_states.shape + qkv_shapes = (batch_size * self.num_attention_heads, height * width, -1) + query = self.q(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + key = self.k(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + value = self.v(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + + attn_weights = (query * self.scale) @ key.transpose(-2, -1) + + if self.use_rel_pos: + attn_weights = self.add_decomposed_rel_pos( + attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width) + ) + + attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1) + attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1) + attn_output = self.proj(attn_output) + + if output_attentions: + outputs = (attn_output, attn_weights) + else: + outputs = (attn_output, None) + return outputs +``` + +**الشرح:** + +- **الإسقاطات المنفصلة:** يتم إزالة الإسقاط المُجمع `qkv`، وإنشاء إسقاطات خطية منفصلة `q` و `k` و `v`. +- **دالة استدعاء تحميل الأوزان:** تقوم طريقة `_split_qkv_load_hook` بتقسيم أوزان `qkv` المسبقة التدريب إلى أوزان `q` و `k` و `v` منفصلة عند تحميل النموذج. يضمن هذا التوافق مع أي نموذج مسبق التدريب. +- **التنفيذ الأمامي:** يتم حساب الاستعلامات والمفاتيح والقيم بشكل منفصل، وتستمر آلية الانتباه كالمعتاد. + +#### **الخطوة 2: استبدال فئة الانتباه الأصلية** + +استبدل فئة `SamVisionAttention` الأصلية بفئتك المخصصة بحيث يستخدم النموذج آلية الانتباه المعدلة. 
+ +```python +from transformers import SamModel +from transformers.models.sam import modeling_sam + +# استبدال فئة الاهتمام في وحدة نمطية modeling_sam +modeling_sam.SamVisionAttention = SamVisionAttentionSplit + +# تحميل نموذج SAM المسبق التدريب +model = SamModel.from_pretrained("facebook/sam-vit-base") +``` + +**الشرح:** + +- **استبدال الفئة:** من خلال تعيين فئتك المخصصة إلى `modeling_sam.SamVisionAttention`، فإن أي حالات من فئة `SamVisionAttention` في النموذج ستستخدم النسخة المعدلة. وبالتالي، عند استدعاء `SamModel`، سيتم استخدام `SamVisionAttentionSplit` المحددة حديثًا. +- **تحميل النموذج:** يتم تحميل النموذج باستخدام `from_pretrained`، ويتم دمج آلية الانتباه المخصصة. + +#### **الخطوة 3: تطبيق LoRA على إسقاطات محددة** + +مع وجود إسقاطات `q` و `k` و `v` منفصلة، يمكنك الآن تطبيق LoRA على مكونات محددة، مثل إسقاطات `q` و `v`. + +```python +from peft import LoraConfig, get_peft_model + +config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q", "v"], # تطبيق LoRA على إسقاطات q و v + lora_dropout=0.1, + task_type="mask-generation" +) + +# تطبيق LoRA على النموذج +model = get_peft_model(model, config) +``` + +**الشرح:** + +- **تكوين LoRA:** تحدد `LoraConfig` المرتبة `r`، وعامل القياس `lora_alpha`، والوحدات المستهدفة (`"q"` و `"v"`)، ومعدل التخلي، ونوع المهمة. +- **تطبيق LoRA:** تقوم دالة `get_peft_model` بتطبيق LoRA على الوحدات المحددة في النموذج. +- **تقليل المعلمات:** من خلال التركيز على `q` و `v`، فإنك تقلل عدد المعلمات القابلة للتدريب، مما يؤدي إلى تسريع التدريب وتقليل استخدام الذاكرة. + +#### **الخطوة 4: التحقق من عدد المعلمات القابلة للتدريب** + +من السهل التحقق من عدد المعلمات القابلة للتدريب ومعرفة تأثير تعديلك. + +```python +model.print_trainable_parameters() +``` + +**الناتج المتوقع:** + +``` +عدد المعلمات القابلة للتدريب: 608,256 || جميع المعلمات: 94,343,728 || نسبة المعلمات القابلة للتدريب: 0.6447 +عدد المعلمات القابلة للتدريب: 912,384 || جميع المعلمات: 94,647,856 || نسبة المعلمات القابلة للتدريب: 0.9640 # مع k +``` + +## المساهمة بابداعاتك الخاصة + +يمكن لتعديل النماذج المسبقة التدريب أن يفتح آفاقًا جديدة للبحث والتطبيق. من خلال فهم وتعديل الآليات الداخلية للنماذج مثل SAM، يمكنك تخصيصها لتلبية احتياجاتك المحددة، وتحسين الأداء، وتجربة أفكار جديدة. + +إذا قمت بتطوير تعديﻻتك الخاصة لنماذج Transformers وترغب في مشاركتها، ففكر في المساهمة في هذه الوثيقة. + +- **إنشاء طلب سحب (Pull Request):** شارك تغييراتك وتحسيناتك في التعليمات البرمجية مباشرة في المستودع. +- **كتابة التوثيق:** قدم تفسيرات وأمثلة واضحة لتعديلاتك. +- **التفاعل مع المجتمع:** ناقش أفكارك واحصل على تعليقات من المطورين والباحثين الآخرين من خلال فتح مشكلة. diff --git a/docs/source/ar/installation.md b/docs/source/ar/installation.md index ac5962ec8589e8..d3bd4c655b6038 100644 --- a/docs/source/ar/installation.md +++ b/docs/source/ar/installation.md @@ -144,7 +144,7 @@ conda install conda-forge::transformers تُحمّل النماذج المُسبقة التدريب وتُخزّن مؤقتًا في: `~/.cache/huggingface/hub`. هذا هو المجلد الافتراضي الذي يُحدده متغير البيئة `TRANSFORMERS_CACHE`. على Windows، يكون دليل ذاكرة التخزين المؤقت الافتراضي هو `C:\Users\username\.cache\huggingface\hub`. يمكنك تغيير متغيرات البيئة shell الموضحة أدناه - حسب الأولوية - لتحديد دليل ذاكرة تخزين مؤقت مختلف: -1. متغير البيئة (افتراضي): `HUGGINGFACE_HUB_CACHE` أو `TRANSFORMERS_CACHE`. +1. متغير البيئة (افتراضي): `HF_HUB_CACHE` أو `TRANSFORMERS_CACHE`. 2. متغير البيئة: `HF_HOME`. 3. متغير البيئة: `XDG_CACHE_HOME` + `/huggingface`. 
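Editor's note: the `docs/source/ar/installation.md` hunk above updates the cache-directory precedence (`HF_HUB_CACHE` or `TRANSFORMERS_CACHE`, then `HF_HOME`, then `XDG_CACHE_HOME` + `/huggingface`). The snippet below is an illustrative sketch of that behaviour, not part of the patch; the cache path is a hypothetical example.

```python
import os

# Relocate the Hugging Face cache before importing transformers.
# Hub-cache precedence: HF_HUB_CACHE (or the older TRANSFORMERS_CACHE),
# then HF_HOME, then XDG_CACHE_HOME + "/huggingface".
os.environ["HF_HOME"] = "/data/hf-cache"             # hypothetical path; moves the whole cache tree
# os.environ["HF_HUB_CACHE"] = "/data/hf-cache/hub"  # would override HF_HOME for the hub cache only

from transformers import AutoModel

# Downloads (or reuses) the checkpoint under the directory configured above.
model = AutoModel.from_pretrained("julien-c/EsperBERTo-small")
```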
diff --git a/docs/source/ar/model_sharing.md b/docs/source/ar/model_sharing.md index 620261a0c58a3b..b802eb3ef038f0 100644 --- a/docs/source/ar/model_sharing.md +++ b/docs/source/ar/model_sharing.md @@ -28,7 +28,7 @@ picture-in-picture" allowfullscreen> ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # اسم العلامة، أو اسم الفرع، أو تجزئة الالتزام +... "julien-c/EsperBERTo-small", revision="4c77982" # اسم العلامة، أو اسم الفرع، أو تجزئة الالتزام ... ) ``` diff --git a/docs/source/ar/modular_transformers.md b/docs/source/ar/modular_transformers.md new file mode 100644 index 00000000000000..b500fec1c92d25 --- /dev/null +++ b/docs/source/ar/modular_transformers.md @@ -0,0 +1,184 @@ +# المحولات النمطية + +مكتبة `transformers` هي إطار عمل ذو فلسفة محدد؛ يتم تعريف فلسفتنا في [الدليل المفاهيمي](./philosophy). + +جوهر هذه الفلسفة يتمثل في مبدأ [نموذج واحد، ملف واحد](https://huggingface.co/blog/transformers-design-philosophy) +في المكتبة. الجانب السلبي لهذا المكون هو تقييده لوراثة واستيراد مكونات الملفات. + +نتيجة لذلك، تتكرر مكونات النموذج عبر العديد من الملفات. يحتوي `transformers` على عدد كبير من طبقات الانتباه، يقارب عدد النماذج، والكثير منها متطابق. يتسبب هذا في تباعد عمليات التنفيذ المستقلة مع تطبيق الإصلاحات والتغييرات. +على أجزاء محددة من التعليمات البرمجية. + +ولمعالجة ذلك، اعتمدنا مفهوم "النسخ" في المكتبة. فبإضافة تعليق يُشير إلى أن التعليمات البرمجية هي نسخة من أخرى، نضمن من خلال أنظمة CI والأوامر المحلية عدم تباعد النسخ. لكن هذه العملية، رغم بساطتها، تُسبب إرهاقاً. كما أنها تزيد العبء على المساهمين، وهو ما نهدف إلى تجاوزه. + +غالباً ما تتطلب مساهمات النماذج إضافة تعليمات برمجية (حوالي 1000 سطر)، ومعالج (حوالي 500 سطر)، واختبارات، ووثائق، إلخ. ونادراً ما تقل مساهمات النماذج عن 3000-5000 سطر من التعليمات البرمجية، معظمها أكواد نمطية. هذا يرفع مستوى المساهمات، + +ونهدف مع المحولات النمطية إلى خفض هذا المستوى إلى حدّ مقبول. + +## ما هو؟ + +تقدم المحولات النمطية مفهوم ملف "نمطي" لمجلد نموذج. يقبل هذا الملف النمطي تعليمات برمجية +غير مقبولة عادة في ملفات النمذجة/المعالجة، حيث يسمح بالاستيراد من نماذج مجاورة وكذلك +الوراثة من الفئات إلى فئات أخرى. + +يعرّف هذا الملف النمطي النماذج والمعالجات وفئة التكوين التي سيتم تعريفها في وحداتهم +المتعلقة. + +وأخيرًا، يقدم هذا الميزة أداة `linter` جديدة والتي ستعمل على "تفكيك" الملف النمطي إلى بنية "نموذج واحد، ملف واحد" +هيكل الدليل. سيتم إنشاء هذه الملفات تلقائيًا في كل مرة يتم فيها تشغيل البرنامج النصي؛ مما يقلل من المساهمات المطلوبة +إلى الملف النمطي، وبالتالي فقط إلى التغييرات بين النموذج المساهم والنماذج الأخرى. + +سيقوم مستخدمو النموذج في النهاية باستيراد واستخدام واجهة الملف الواحد، لذا لا يتوقع حدوث أي تغيير هنا. من خلال القيام بذلك، +نأمل في الجمع بين أفضل ما في العالمين: تمكين المساهمات البسيطة مع الالتزام بفلسفتنا. + +لذلك، هذا بديل لعلامات `# Copied from`، ويمكن توقع انتقال النماذج المساهمة سابقًا إلى +تنسيق المحولات النمطية الجديد في الأشهر المقبلة. + +### التفاصيل + +تُبسط أداة "linter" الوراثة، مُنشئةً جميع الملفات المفردة من الملف النمطي، مع الحفاظ على شفافيتها أمام مستخدمي Python. حاليًا، تُبسط الأداة مستوىً واحدًا من الوراثة + +على سبيل المثال: +- إذا ورثت فئة التكوين من فئة أخرى وأضافت/حذفت معامل، فسيتم إما الإشارة إلى الملف المولد مباشرةً + (في حالة الإضافة) أو إزالته تمامًا (في حالة الحذف). +- إذا ورثت فئة من فئة أخرى، على سبيل المثال: `class GemmaModel(LlamaModel):`، تُستنتج التبعيات تلقائيًا + سيتم استنتاج جميع الوحدات الفرعية تلقائيًا من الفئة الأصلية. 
+- إذا قمت بتعريف وظائف جديدة في الملف `modular` واستخدمتها داخل الفئات، فستستنتج أداة linter ذلك تلقائيًا + +يجب أن تكون قادرًا على كتابة كل شيء (المجزىء اللغوي، ومُعالِج الصور، والنموذج، والتكوين) في الملف `modular`، وسيتم إنشاء الملفات المُقابلة تلقائيًا. + +### التطبيق + +[TODO] نقدم اختبارًا جديدًا، للتأكد من أن المحتوى المولد يتطابق مع ما هو موجود في `modular_xxxx.py` + +### الأمثلة + +هنا مثال سريع باستخدام BERT و RoBERTa. النموذجان مرتبطان ارتباطًا وثيقًا: يختلف تنفيذهما النموذجي في طبقة تضمين. + +بدلاً من إعادة تعريف النموذج بالكامل، إليك كيف يبدو ملف `modular_roberta.py` لفئات النمذجة والتكوين (لأغراض المثال، يتم تجاهل المجزىء اللغوي في هذا الوقت حيث أنه مختلف جدًا). + +```python +from torch import nn +from ..bert.configuration_bert import BertConfig +from ..bert.modeling_bert import ( + BertModel, + BertEmbeddings, + BertForMaskedLM +) + +# تكوين RoBERTa مطابق لتكوين BERT +class RobertaConfig(BertConfig): + model_type = 'roberta' + +# نعيد تعريف الإضافات هنا لتسليط الضوء على اختلاف معرف الحشو، ونعيد تعريف الإضافات الموضعية +class RobertaEmbeddings(BertEmbeddings): + def __init__(self, config): + super().__init__(config()) + + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + +# نموذج RoBERTa مطابق لنموذج BERT، باستثناء طبقة الإضافات. +# نعيد تعريف الإضافات أعلاه، لذا هنا لا توجد حاجة لعمل إضافي +class RobertaModel(BertModel): + def __init__(self, config): + super().__init__(config) + self.embeddings = RobertaEmbeddings(config) + + +# الرؤوس الآن تحتاج فقط إلى إعادة تعريف النموذج داخل `RobertaModel` الصحيح +class RobertaForMaskedLM(BertForMaskedLM): + def __init__(self, config): + super().__init__(config) + self.model = RobertaModel(config) +``` + +لاحظ أنه إذا لم تستخدم الاعتماد الذي حددته، فستحصل على الخطأ التالي: + +```bash +ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should be used + when you define `BertModel`, as it is one of it's direct dependencies. Make sure + you use it in the `__init__` function. +``` + +بالإضافة إلى ذلك، قد تجد قائمة بالأمثلة هنا: + +## ما هو ليس كذلك + +ليس بديلاً لتعليمات برمجة النمذجة (بعد؟)، وإذا لم يكن نموذجك يعتمد على أي شيء آخر موجود من قبل، فيمكنك إضافة ملف `نمذجة` كالعادة. + + +## الاستخدام المتقدم + +### إزالة السمات والوظائف +لإزالة السمات التي لا تستخدم في نموذجك النمطي، والتي لا تريد رؤيتها في النمذجة المفككة: + +```python +class GemmaModel(LlamaModel): | class GemmaModel(PreTrainedModel): + def __init__(self, config): | def __init__(self, config): + super().__init__(self, eos_token) | super().__init__(config) + del self.embed_tokens | self.padding_idx = config.pad_token_id + | self.vocab_size = config.vocab_size + | + | self.layers = nn.ModuleList( + | [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + | ) + | self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + | self.rotary_emb = LlamaRotaryEmbedding(config=config) + | self.gradient_checkpointing = False + | + | # Initialize weights and apply final processing + | self.post_init() +``` +إذا قمت بالتحقق من `LlamaModel` الأصلي، فستجد `embed_tokens` الذي تمت إزالته هنا (كما هو متوقع!) + +إزالة وظيفة مشابهة، تحتاج فقط إلى كتابتها مع `raise ValueError("")` لمحاكاة السلوك الذي تريده فعليًا عند إزالة وظيفة أصلية في بايثون. + +```python +class GemmaTokenizer(LlamaTokenizer): + ... 
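+    # نعيد تعريف الدالتين التاليتين بحيث ترفعان استثناءً لمحاكاة إزالتهما من الفئة الأصلية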
+ + def get_spm_processor(self): + raise AttributeError("Not needed for Gemma") + + def unk_token_length(self): + raise AttributeError("Not needed for Gemma") +``` + +### تعريف وظائف جديدة + +إذا قمت بتعريف وظيفة جديدة في الملف `modular` لاستخدامها داخل فئة، على سبيل المثال + +```python +def my_new_function(*args, **kwargs): + # Do something here + pass + +class GemmaModel(LlamaModel): + def forward(*args, **kwargs): + # Call the function + example = my_new_function(*args, **kwargs) + # continue here +``` + +سيتم نسخ وظيفة `my_new_function` (وبشكل متكرر، أي وظائف أخرى جديدة يتم استدعاؤها في جسمها) تلقائيًا +في الملف الذي يتم استخدامه. + +### استدعاء `super()` +قمنا مؤخرًا بشحن بعض الميزات التي تسمح لك بالانتقال من: +```python +class GemmaTokenizer(LlamaTokenizer, PretrainedTokenizerFast): | class GemmaModel(nn.Module): + def __init__(self, eos_token=""): | def __init__(self): + eos_token = AddedToken(eos_token) | eos_token = AddedToken(eos_token) + PretrainedTokenizerFast.__init__(self, eos_token) | super().__init__(eos_token) +``` +هذا مفيد عندما لا تريد تفكيك استدعاء `super()`، وتريد التمييز بين أي استدعاء super init تقوم به! + +### التسمية الخاصة +ندعم الآن أيضًا حالات خاصة مثل +```python +class GemmaVisionModel(CLIPModel): + pass +``` +حيث اسم فئة `GemmaVision` الخاصة بك ليس هو نفسه `Gemma` النمطي. هذا مفيد للغاية للنماذج المركبة. diff --git a/docs/source/ar/notebooks.md b/docs/source/ar/notebooks.md new file mode 100644 index 00000000000000..0591204d602c7e --- /dev/null +++ b/docs/source/ar/notebooks.md @@ -0,0 +1,141 @@ +# دفاتر ملاحظات 🤗 Transformers + +يمكنك أن تجد هنا قائمة بدفاتر الملاحظات الرسمية التي تقدمها Hugging Face. + +كما نود أن ندرج هنا محتوى مثيرًا للاهتمام تم إنشاؤه بواسطة المجتمع. +إذا كتبت دفتر ملاحظات يستفيد من 🤗 Transformers وتود إدراجه هنا، فيُرجى فتح طلب سحب حتى يمكن تضمينه ضمن دفاتر ملاحظات المجتمع. 
+ + +## دفاتر ملاحظات Hugging Face 🤗 + +### دفاتر ملاحظات التوثيق + +يمكنك فتح أي صفحة من صفحات التوثيق كدفتر ملاحظات في Colab (يوجد زر مباشرة على تلك الصفحات) ولكنها مدرجة هنا أيضًا إذا كنت بحاجة إليها: + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [جولة سريعة في المكتبة](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb) | عرض لمختلف واجهات برمجة التطبيقات في Transformers |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/en/transformers_doc/quicktour.ipynb)| +| [ملخص المهام](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb) | كيفية تشغيل نماذج مكتبة Transformers مهمة تلو الأخرى |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| +| [معالجة البيانات مسبقًا](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb) | كيفية استخدام محلل لغوي لمعالجة بياناتك مسبقًا |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)| +| [الضبط الدقيق لنموذج مُدرَّب مسبقًا](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb) | كيفية استخدام المدرب لضبط نموذج مُدرَّب مسبقًا بدقة |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| +| [ملخص للمحللات اللغوية](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb) | الاختلافات بين خوارزمية المحلل اللغوي |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| +| [النماذج متعددة اللغات](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb) | كيفية استخدام النماذج متعددة اللغات للمكتبة |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| [![Open in AWS 
Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| + + +### أمثلة PyTorch + +#### معالجة اللغة الطبيعية[[pytorch-nlp]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [تدريب محللك اللغوي](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb) | كيفية تدريب واستخدام محللك اللغوي الخاص بك |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| +| [تدريب نموذج لغتك](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb) | كيفية البدء بسهولة في استخدام المحولات |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على أي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| +| [كيفية ضبط نموذج بدقة على النمذجة اللغوية](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة LM سببية أو مقنعة. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الرموز المميزة](https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة تصنيف الرموز المميزة (NER، PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| +| [كيفية ضبط نموذج بدقة على الإجابة على الأسئلة](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SQUAD. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| +| [كيفية ضبط نموذج بدقة على الاختيار من متعدد](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| +| [كيفية ضبط نموذج بدقة على الترجمة](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| +| [كيفية ضبط نموذج بدقة على التلخيص](https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على XSUM. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| +| [كيفية تدريب نموذج لغة من البداية](https://github.com/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| تسليط الضوء على جميع الخطوات لتدريب نموذج Transformer بشكل فعال على بيانات مخصصة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| +| [كيفية إنشاء نص](https://github.com/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| كيفية استخدام أساليب فك التشفير المختلفة لإنشاء اللغة باستخدام المحولات | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| +| [كيفية إنشاء نص (مع قيود)](https://github.com/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| كيفية توجيه إنشاء اللغة باستخدام القيود التي يوفرها المستخدم | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| [![Open in AWS 
Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/53_constrained_beam_search.ipynb)| +| [Reformer](https://github.com/huggingface/blog/blob/main/notebooks/03_reformer.ipynb)| كيف يدفع Reformer حدود النمذجة اللغوية | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/patrickvonplaten/blog/blob/main/notebooks/03_reformer.ipynb)| + +#### رؤية الكمبيوتر[[pytorch-cv]] + +| دفتر الملاحظات | الوصف | | | +|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------:| +| [كيفية ضبط نموذج بدقة على تصنيف الصور (Torchvision)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb) | يوضح كيفية معالجة البيانات مسبقًا باستخدام Torchvision وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الصور (Albumentations)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) | يوضح كيفية معالجة البيانات مسبقًا باستخدام Albumentations وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الصور (Kornia)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb) | يوضح كيفية معالجة البيانات مسبقًا باستخدام Kornia وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)| +| [كيفية إجراء الكشف عن الأشياء بدون لقطات مع OWL-ViT](https://github.com/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb) | يوضح كيفية إجراء الكشف عن الأشياء بدون لقطات على الصور باستخدام استعلامات نصية | [![Open in 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| +| [كيفية ضبط نموذج وصف الصور بدقة](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) | يوضح كيفية ضبط BLIP بدقة لوصف الصور على مجموعة بيانات مخصصة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)| +| [كيفية بناء نظام تشابه الصور مع Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_similarity.ipynb) | يوضح كيفية بناء نظام تشابه الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)| +| [كيفية ضبط نموذج SegFormer بدقة على التجزئة الدلالية](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج SegFormer مُدرَّب مسبقًا بدقة على التجزئة الدلالية | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)| +| [كيفية ضبط نموذج VideoMAE بدقة على تصنيف الفيديو](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج VideoMAE مُدرَّب مسبقًا بدقة على تصنيف الفيديو | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)| + + +#### الصوت[[pytorch-audio]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية ضبط نموذج التعرف على الكلام باللغة الإنجليزية بدقة](https://github.com/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج كلام مُدرَّب مسبقًا بدقة على TIMIT | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/speech_recognition.ipynb)| +| [كيفية ضبط نموذج التعرف على الكلام بأي لغة 
بدقة](https://github.com/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج كلام مُدرَّب مسبقًا متعدد اللغات بدقة على Common Voice | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multi_lingual_speech_recognition.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الصوت](https://github.com/huggingface/notebooks/blob/main/examples/audio_classification.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج كلام مُدرَّب مسبقًا بدقة على Keyword Spotting | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb)| + + +#### التسلسلات البيولوجية[[pytorch-bio]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:----------------------------------------------------------------------------------------|:-------------|------:| +| [كيفية ضبط نموذج بروتين مُدرَّب مسبقًا بدقة](https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb) | شاهد كيفية ترميز البروتينات وضبط نموذج "لغة" بروتين مُدرَّب مسبقًا كبير بدقة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/protein_language_modeling.ipynb) | +| [كيفية إنشاء طيات بروتينية](https://github.com/huggingface/notebooks/blob/main/examples/protein_folding.ipynb) | شاهد كيفية الانتقال من تسلسل البروتين إلى نموذج بروتين كامل وملف PDB | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_folding.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/protein_folding.ipynb) | +| [كيفية ضبط نموذج محول النيوكليوتيدات بدقة](https://github.com/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | شاهد كيفية ترميز الحمض النووي وضبط نموذج "لغة" الحمض النووي مُدرَّب مسبقًا كبير بدقة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling.ipynb) | +| [ضبط نموذج محول النيوكليوتيدات بدقة باستخدام LoRA](https://github.com/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | تدريب نماذج DNA أكبر بكثير بطريقة فعالة من حيث الذاكرة | [![Open in 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/nucleotide_transformer_dna_sequence_modelling_with_peft.ipynb) | + + +#### طرائق أخرى[[pytorch-other]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:----------------------------------------------------------------------------------------|:-------------|------:| +| [التنبؤ الاحتمالي بالسلاسل الزمنية](https://github.com/huggingface/notebooks/blob/main/examples/time-series-transformers.ipynb) | شاهد كيفية تدريب Time Series Transformer على مجموعة بيانات مخصصة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/time-series-transformers.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/time-series-transformers.ipynb) | + +#### دفاتر ملاحظات الأدوات المساعدة [[pytorch-utility]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية تصدير النموذج إلى ONNX](https://github.com/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| تسليط الضوء على كيفية التصدير وتشغيل أعباء عمل الاستدلال من خلال ONNX | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/onnx-export.ipynb)| +| [كيفية استخدام المعايير](https://github.com/huggingface/notebooks/blob/main/examples/benchmark.ipynb)| كيفية قياس أداء النماذج باستخدام المحولات | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/benchmark.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/benchmark.ipynb)| + +### أمثلة TensorFlow + +#### معالجة اللغة الطبيعية[[tensorflow-nlp]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [تدريب محللك اللغوي](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb) | كيفية تدريب واستخدام محللك اللغوي الخاص بك |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| +| [تدريب نموذج لغتك](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch-tf.ipynb) | كيفية البدء بسهولة في استخدام المحولات |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch-tf.ipynb)| [![Open in AWS 
Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على أي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على النمذجة اللغوية](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة LM سببية أو مقنعة. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف الرموز المميزة](https://github.com/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على مهمة تصنيف الرموز المميزة (NER، PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على الإجابة على الأسئلة](https://github.com/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SQUAD. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على الاختيار من متعدد](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على الترجمة](https://github.com/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على WMT. 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)| +| [كيفية ضبط نموذج بدقة على التلخيص](https://github.com/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج مُدرَّب مسبقًا بدقة على XSUM. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)| + +#### رؤية الكمبيوتر[[tensorflow-cv]] + +| دفتر الملاحظات | الوصف | | | +|:---------------------------------------------------------------------------------------------------------------------------------------------------------|:----------------------------------------------------------------------------------------------------|:-------------|------:| +| [كيفية ضبط نموذج بدقة على تصنيف الصور](https://github.com/huggingface/notebooks/blob/main/examples/image_classification-tf.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط أي نموذج رؤية مُدرَّب مسبقًا بدقة على تصنيف الصور | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification-tf.ipynb)| +| [كيفية ضبط نموذج SegFormer بدقة على التجزئة الدلالية](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb) | يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج SegFormer مُدرَّب مسبقًا بدقة على التجزئة الدلالية | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/semantic_segmentation-tf.ipynb)| + +#### التسلسلات البيولوجية[[tensorflow-bio]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية ضبط نموذج بروتين مُدرَّب مسبقًا بدقة](https://github.com/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb) | شاهد كيفية ترميز البروتينات وضبط نموذج "لغة" بروتين مُدرَّب مسبقًا كبير بدقة | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/protein_language_modeling-tf.ipynb) | + +#### دفاتر ملاحظات الأدوات المساعدة [[tensorflow-utility]] + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية تدريب نماذج TF/Keras على TPU](https://github.com/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) | 
شاهد كيفية التدريب بسرعة عالية على أجهزة TPU من Google | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb) | + +### دفاتر ملاحظات Optimum + +🤗 [Optimum](https://github.com/huggingface/optimum) هو امتداد لـ 🤗 Transformers، يوفر مجموعة من أدوات تحسين الأداء التي تمكن من تحقيق أقصى قدر من الكفاءة لتدريب وتشغيل النماذج على الأجهزة المستهدفة. + +| دفتر الملاحظات | الوصف | | | +|:----------|:-------------|:-------------|------:| +| [كيفية تكميم نموذج باستخدام ONNX Runtime لتصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| يوضح كيفية تطبيق التكميم الثابت والديناميكي على نموذج باستخدام [ONNX Runtime](https://github.com/microsoft/onnxruntime) لأي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| +| [كيفية تكميم نموذج باستخدام Intel Neural Compressor لتصنيف النص](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| يوضح كيفية تطبيق التكميم الثابت والديناميكي والتدريبي على نموذج باستخدام [Intel Neural Compressor (INC)](https://github.com/intel/neural-compressor) لأي مهمة GLUE. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| +| [كيفية ضبط نموذج بدقة على تصنيف النص باستخدام ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج بدقة على أي مهمة GLUE باستخدام [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| +| [كيفية ضبط نموذج بدقة على التلخيص باستخدام ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| يوضح كيفية معالجة البيانات مسبقًا وضبط نموذج بدقة على XSUM باستخدام [ONNX Runtime](https://github.com/microsoft/onnxruntime). 
| [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| + + +## دفاتر ملاحظات المجتمع: + +تتوفر المزيد من دفاتر الملاحظات التي طورها المجتمع [هنا](https://hf.co/docs/transformers/community#community-notebooks). + diff --git a/docs/source/ar/quicktour.md b/docs/source/ar/quicktour.md index 9a99c28287d622..1795c3a5d74fcc 100644 --- a/docs/source/ar/quicktour.md +++ b/docs/source/ar/quicktour.md @@ -347,8 +347,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], ```py >>> from transformers import AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) ``` @@ -356,8 +356,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], ```py >>> from transformers import TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) ``` diff --git a/docs/source/ar/sagemaker.md b/docs/source/ar/sagemaker.md new file mode 100644 index 00000000000000..6bb53816baaaee --- /dev/null +++ b/docs/source/ar/sagemaker.md @@ -0,0 +1,8 @@ +# تشغيل التدريب على Amazon SageMaker + +تم نقل التوثيق إلى [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). وسيتم إزالة هذه الصفحة في الإصدار 5.0 من برنامج Transformers. + +### جدول المحتويات + +- [تدريب نماذج Hugging Face على Amazon SageMaker باستخدام SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train) +- [نشر نماذج Hugging Face على Amazon SageMaker باستخدام SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference) \ No newline at end of file diff --git a/docs/source/ar/serialization.md b/docs/source/ar/serialization.md new file mode 100644 index 00000000000000..2df620d86239a0 --- /dev/null +++ b/docs/source/ar/serialization.md @@ -0,0 +1,170 @@ +# التصدير إلى ONNX + +غالباً ما يتطلب نشر نماذج 🤗 Transformers في بيئات الإنتاج أو يمكن أن يستفيد من تصدير النماذج إلى تنسيق تسلسلي يُمكن تحميله وتنفيذه على أجهزة وبرامج تشغيل مُتخصصة. + +🤗 Optimum هو امتداد لـ Transformers يمكّن من تصدير النماذج من PyTorch أو TensorFlow إلى تنسيقات مُتسلسلة مثل ONNX و TFLite من خلال وحدة `exporters` الخاصة به. يوفر 🤗 Optimum أيضًا مجموعة من أدوات تحسين الأداء لتدريب النماذج وتشغيلها على أجهزة مستهدفة بكفاءة قصوى. + +يوضح هذا الدليل كيفية تصدير نماذج 🤗 Transformers إلى ONNX باستخدام 🤗 Optimum، وللحصول على الدليل الخاص بتصدير النماذج إلى TFLite، يُرجى الرجوع إلى صفحة [التصدير إلى TFLite](tflite). + +## التصدير إلى ONNX + +مجمد [ONNX (Open Neural Network Exchange)](http://onnx.ai) هو معيار مفتوح يُحدد مجموعة مشتركة من العوامل وتنسيق ملف مشترك لتمثيل نماذج التعلم العميق في مجموعة متنوعة واسعة من الأطر، بما في ذلك PyTorch وTensorFlow. 
عندما يتم تصدير نموذج إلى تنسيق ONNX، يتم استخدام هذه المشغلات لبناء رسم بياني حاسوبي (يُطلق عليه غالبًا اسم _تمثيل وسيط_) والذي يمثل تدفق البيانات عبر الشبكة العصبية. + +من خلال عرض رسم بياني بعوامل وأنواع بيانات معيارية، يُسهّل ONNX التبديل بين الأطر. على سبيل المثال، يُمكن تصدير نموذج مدرب في PyTorch إلى تنسيق ONNX ثم استيراده في TensorFlow (والعكس صحيح). + +بمجرد التصدير إلى تنسيق ONNX، يُمكن: + +- تحسين النموذج للاستدلال عبر تقنيات مثل [تحسين الرسم البياني](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization) و [التكميم](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization). +- تشغيله باستخدام ONNX Runtime عبر فئات [`ORTModelForXXX`](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort)، والتي تتبع نفس واجهة برمجة التطبيقات (API) لـ `AutoModel` التي اعتدت عليها في 🤗 Transformers. +- تشغيله باستخدام [قنوات معالجة الاستدلال مُحسّنة](https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/pipelines)، والتي لها نفس واجهة برمجة التطبيقات (API) مثل وظيفة [`pipeline`] في 🤗 Transformers. + +يوفر 🤗 Optimum دعمًا لتصدير ONNX من خلال الاستفادة من كائنات التكوين. تأتي كائنات التكوين هذه جاهزة لعدد من معماريات النماذج، وقد تم تصميمها لتكون قابلة للتوسعة بسهولة إلى معماريات أخرى. + +للاطلاع على قائمة بالتكوينات الجاهزة، يُرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/overview). + +هناك طريقتان لتصدير نموذج 🤗 Transformers إلى ONNX، نعرض هنا كليهما: + +- التصدير باستخدام 🤗 Optimum عبر واجهة سطر الأوامر (CLI). +- التصدير باستخدام 🤗 Optimum مع `optimum.onnxruntime`. + +### تصدير نموذج 🤗 Transformers إلى ONNX باستخدام واجهة سطر الأوامر + +لتصدير نموذج 🤗 Transformers إلى ONNX، قم أولاً بتثبيت اعتماد إضافي: + +```bash +pip install optimum[exporters] +``` + +للاطلاع على جميع المعامﻻت المتاحة، يرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)، أو عرض المساعدة في سطر الأوامر: + +```bash +optimum-cli export onnx --help +``` +```bash +optimum-cli export onnx --help +``` + +لتصدير نقطة تفتيش نموذج من 🤗 Hub، على سبيل المثال، `distilbert/distilbert-base-uncased-distilled-squad`، قم بتشغيل الأمر التالي: + +```bash +optimum-cli export onnx --model distilbert/distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/ +``` + +يجب أن تشاهد السجلات التي تشير إلى التقدم المحرز وتظهر المكان الذي تم فيه حفظ ملف `model.onnx` الناتج، مثل هذا: + +```bash +Validating ONNX model distilbert_base_uncased_squad_onnx/model.onnx... + -[✓] ONNX model output names match reference model (start_logits, end_logits) + - Validating ONNX Model output "start_logits": + -[✓] (2, 16) matches (2, 16) + -[✓] all values close (atol: 0.0001) + - Validating ONNX Model output "end_logits": + -[✓] (2, 16) matches (2, 16) + -[✓] all values close (atol: 0.0001) +The ONNX export succeeded and the exported model was saved at: distilbert_base_uncased_squad_onnx +``` + +يوضح المثال أعلاه تصدير نقطة تفتيش من 🤗 Hub. عند تصدير نموذج محلي، تأكد أولاً من حفظ ملفات أوزان النموذج ومحول الرموز في نفس الدليل (`local_path`). عند استخدام واجهة سطر الأوامر، قم بتمرير `local_path` إلى وسيط `model` بدلاً من اسم نقطة التفتيش على 🤗 Hub وقدم وسيط `--task`. يمكنك مراجعة قائمة المهام المدعومة في [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/task_manager). إذا لم يتم توفير وسيط `task`، فسيتم تعيينه افتراضيًا إلى هندسة النموذج دون أي رأس محدد للمهمة. 
+ +```bash +optimum-cli export onnx --model local_path --task question-answering distilbert_base_uncased_squad_onnx/ +``` + +يمكن بعد ذلك تشغيل ملف `model.onnx` الناتج على أحد [المسرعات](https://onnx.ai/supported-tools.html#deployModel) العديدة التي تدعم معيار ONNX. على سبيل المثال، يمكننا تحميل النموذج وتشغيله باستخدام [ONNX Runtime](https://onnxruntime.ai/) كما يلي: + +```python +>>> from transformers import AutoTokenizer +>>> from optimum.onnxruntime import ORTModelForQuestionAnswering + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx") +>>> model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx") +>>> inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt") +>>> outputs = model(**inputs) +``` + +تكون العملية مماثلة بالنسبة إلى نقاط تفتيش TensorFlow على Hub. على سبيل المثال، إليك كيفية تصدير نقطة تفتيش TensorFlow نقية من [منظمة Keras](https://huggingface.co/keras-io): + +```bash +optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_squad_onnx/ +``` + +### تصدير نموذج 🤗 Transformers إلى ONNX باستخدام `optimum.onnxruntime` + +كبديل لواجهة سطر الأوامر، يُمكنك تصدير نموذج 🤗 Transformers إلى ONNX برمجيًا كما يلي: + +```python +>>> from optimum.onnxruntime import ORTModelForSequenceClassification +>>> from transformers import AutoTokenizer + +>>> model_checkpoint = "distilbert_base_uncased_squad" +>>> save_directory = "onnx/" + +>>> # تحميل نموذج من transformers وتصديره إلى ONNX +>>> ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True) +>>> tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) + +>>> # حفظ نموذج onnx ومجزىء النصوص +>>> ort_model.save_pretrained(save_directory) +>>> tokenizer.save_pretrained(save_directory) +``` + +### تصدير نموذج لهندسة غير مدعومة + +إذا كنت ترغب في المساهمة من خلال إضافة دعم لنموذج لا يُمكن تصديره حاليًا، فيجب عليك أولاً التحقق مما إذا كان مدعومًا في [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)، وإذا لم يكن مدعومًا، [فيمكنك المساهمة في 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) مُباشرةً. + +### تصدير نموذج باستخدام `transformers.onnx` + + + +لم يعد يتم دعم `tranformers.onnx` يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة. + + + +لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `tranformers.onnx`، ثبّت التبعيات الإضافية: + +```bash +pip install transformers[onnx] +``` + +استخدم حزمة `transformers.onnx` كنموذج Python لتصدير نقطة حفظ باستخدام تكوين جاهز: + +```bash +python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/ +``` + +يُصدّر هذا رسمًا بيانيًا ONNX لنقطة الحفظ المُحددة بواسطة وسيطة `--model`. مرر أي نقطة حفظ على 🤗 Hub أو نقطة حفظ مُخزنة محليًا. +يُمكن بعد ذلك تشغيل ملف `model.onnx` الناتج على أحد المُسرعات العديدة التي تدعم معيار ONNX. 
على سبيل المثال، قم بتحميل وتشغيل النموذج باستخدام ONNX Runtime كما يلي: + +```python +>>> from transformers import AutoTokenizer +>>> from onnxruntime import InferenceSession + +>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") +>>> session = InferenceSession("onnx/model.onnx") +>>> # يتوقع ONNX Runtime مصفوفات NumPy كمدخلات +>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np") +>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) +``` + +يُمكن الحصول على أسماء المخرجات المطلوبة (مثل `["last_hidden_state"]`) من خلال إلقاء نظرة على تكوين ONNX لكل نموذج. على سبيل المثال، بالنسبة لـ DistilBERT، لدينا: + +```python +>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig + +>>> config = DistilBertConfig() +>>> onnx_config = DistilBertOnnxConfig(config) +>>> print(list(onnx_config.outputs.keys())) +["last_hidden_state"] +``` + +العمليات مُتطابقة لنقاط الحفظ TensorFlow على Hub. على سبيل المثال، صدّر نقطة حفظ TensorFlow خالصة كما يلي: + +```bash +python -m transformers.onnx --model=keras-io/transformers-qa onnx/ +``` + +لتصدير نموذج مُخزن محليًا، احفظ أوزان النموذج ومجزىء اللغوى في نفس الدليل (على سبيل المثال `local-pt-checkpoint`)، ثم قم بتصديره إلى ONNX عن طريق توجيه وسيط `--model` لحزمة `transformers.onnx` إلى الدليل المطلوب: + +```bash +python -m transformers.onnx --model=local-pt-checkpoint onnx/ +``` \ No newline at end of file diff --git a/docs/source/ar/tiktoken.md b/docs/source/ar/tiktoken.md new file mode 100644 index 00000000000000..6f3755d8670cdc --- /dev/null +++ b/docs/source/ar/tiktoken.md @@ -0,0 +1,41 @@ +# Tiktoken والتفاعل مع Transformers + +يتم دمج دعم ملفات نموذج tiktoken بسلاسة في 🤗 transformers عند تحميل النماذج +`from_pretrained` مع ملف `tokenizer.model` tiktoken على Hub، والذي يتم تحويله تلقائيًا إلى [المحلل اللغوي السريع](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast). + +### النماذج المعروفة التي تم إصدارها مع `tiktoken.model`: + - gpt2 + - llama3 + +## مثال على الاستخدام + +من أجل تحميل ملفات `tiktoken` في `transformers`، تأكد من أن ملف `tokenizer.model` هو ملف tiktoken وسيتم تحميله تلقائيًا عند التحميل `from_pretrained`. إليك كيفية تحميل مجزىء لغوي ونموذج، والذي +يمكن تحميله من نفس الملف بالضبط: + +```py +from transformers import AutoTokenizer + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original") +``` +## إنشاء مجزىء لغوي tiktoken + +لا يحتوي ملف `tokenizer.model` على أي معلومات حول الرموز أو الأنماط الإضافية. إذا كانت هذه الأمور مهمة، قم بتحويل المحلل اللغوي إلى `tokenizer.json`، وهو التنسيق المناسب لـ [`PreTrainedTokenizerFast`]. + +قم بتوليد ملف `tokenizer.model` باستخدام [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) ثم قم بتحويله إلى `tokenizer.json` باستخدام [`convert_tiktoken_to_fast`]. + +```py + +from transformers.integrations.tiktoken import convert_tiktoken_to_fast +from tiktoken import get_encoding + +# يمكنك تحميل ترميزك المخصص أو الترميز الذي توفره OpenAI +encoding = get_encoding("gpt2") +convert_tiktoken_to_fast(encoding, "config/save/dir") +``` + +يتم حفظ ملف `tokenizer.json` الناتج في الدليل المحدد ويمكن تحميله باستخدام [`PreTrainedTokenizerFast`]. 
+ +```py +tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir") +``` diff --git a/docs/source/ar/torchscript.md b/docs/source/ar/torchscript.md new file mode 100644 index 00000000000000..bf0bc0dde04b62 --- /dev/null +++ b/docs/source/ar/torchscript.md @@ -0,0 +1,154 @@ +# التصدير إلى TorchScript + + + +هذه هي بداية تجاربنا مع TorchScript ولا زلنا نستكشف قدراته مع نماذج المدخلات المتغيرة الحجم. إنه مجال اهتمامنا وسنعمق تحليلنا في الإصدارات القادمة، مع المزيد من الأمثلة البرمجية، وتنفيذ أكثر مرونة، ومقاييس مقارنة بين الأكواد القائمة على Python مع أكواد TorchScript المُجمّعة. + + + +وفقًا لـ [وثائق TorchScript](https://pytorch.org/docs/stable/jit.html): + +> TorchScript هي طريقة لإنشاء نماذج قابلة للتسلسل والتحسين من تعليمات PyTorch البرمجية. + +هناك وحدتان من PyTorch، [JIT and TRACE](https://pytorch.org/docs/stable/jit.html)، تتيحان للمطورين تصدير نماذجهم لإعادة استخدامها في برامج أخرى مثل برامج C++ المُحسّنة للأداء. + +نقدم واجهة تتيح لك تصدير نماذج 🤗 Transformers إلى TorchScript بحيث يمكن إعادة استخدامها في بيئة مختلفة عن برامج Python القائمة إلى PyTorch. هنا نشرح كيفية تصدير نماذجنا واستخدامها باستخدام TorchScript. + +يتطلب تصدير نموذج أمرين: + +- تهيئة مثيل للنموذج باستخدام علامة `torchscript` +- تمرير مُدخلات وهمية (dummy inputs) خلال النموذج + +تنطوي هذه الضرورات على عدة أمور يجب على المطورين توخي الحذر بشأنها كما هو مفصل أدناه. + +## علامة TorchScript والأوزان المرتبطة + +علامة `torchscript` ضرورية لأن معظم نماذج اللغة 🤗 Transformers لها أوزان مرتبطة بين طبقة `Embedding` وطبقة `Decoding`. لا يسمح لك TorchScript بتصدير النماذج ذات الأوزان المرتبطة، لذلك من الضروري فصل الأوزان ونسخها مسبقًا. + +النماذج المُهيأة باستخدام علامة `torchscript` لها طبقة `Embedding` وطبقة`Decoding` منفصلتين، مما يعني أنه لا ينبغي تدريبها لاحقًا. سيؤدي التدريب إلى عدم تزامن الطبقتين، مما يؤدي إلى نتائج غير متوقعة. + +هذا لا ينطبق على النماذج التي لا تحتوي على رأس نموذج اللغة، حيث لا تملك أوزانًا مرتبطة. يمكن تصدير هذه النماذج بأمان دون علامة `torchscript`. + +## المدخلات الوهمية والأطوال القياسية + +تُستخدم المُدخلات الوهمية لتمرير أمامي خلال النموذج. أثناء انتشار قيم المُدخلات عبر الطبقات، يتتبع PyTorch العمليات المختلفة التي يتم تنفيذها على كل مصفوفة(tensor). ثم يتم استخدام هذه العمليات المُسجلة بعد ذلك لإنشاء *أثر* النموذج. + +يتم إنشاء التتبع بالنسبة لأبعاد المُدخلات. وبالتالي، فهو مُقيّد بأبعاد المُدخلات الوهمية، ولن يعمل لأي طول تسلسل أو حجم دفعة مختلف. عند المحاولة بحجم مختلف، يتم رفع الخطأ التالي: + +``` +`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` +``` + +نوصي بتتبع النموذج باستخدام حجم مُدخلات وهمية لا يقل عن أكبر مُدخل سيتم تقديمه للنموذج أثناء الاستدلال. يمكن أن تساعد الحشوة(padding) في ملء القيم المفقودة. ومع ذلك، نظرًا لتتبع النموذج بحجم مُدخل أكبر، ستكون أبعاد المصفوفة ستكون كبيرة أيضًا، مما يؤدي عنه المزيد من الحسابات. + +انتبه إلى إجمالي عدد العمليات المُنفذة على كل مُدخل وتابع الأداء عن كثب عند تصدير نماذج متغيرة طول التسلسل. + +## استخدام TorchScript في Python + +يوضح هذا القسم كيفية حفظ النماذج وتحميلها، بالإضافة إلى كيفية استخدام التتبع للاستدلال. + +### حفظ نموذج + +لتصدير `BertModel` باستخدام TorchScript، قم بتهيئة ـ `BertModel` من فئة `BertConfig` ثم احفظه على القرص تحت اسم الملف `traced_bert.pt`: + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch + +enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") + +# Tokenizing input text +text = "[CLS] Who was Jim Henson ? 
[SEP] Jim Henson was a puppeteer [SEP]" +tokenized_text = enc.tokenize(text) + +# Masking one of the input tokens +masked_index = 8 +tokenized_text[masked_index] = "[MASK]" +indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) +segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] + +# Creating a dummy input +tokens_tensor = torch.tensor([indexed_tokens]) +segments_tensors = torch.tensor([segments_ids]) +dummy_input = [tokens_tensor, segments_tensors] + +# Initializing the model with the torchscript flag +# Flag set to True even though it is not necessary as this model does not have an LM Head. +config = BertConfig( + vocab_size_or_config_json_file=32000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + torchscript=True, +) + +# Instantiating the model +model = BertModel(config) + +# The model needs to be in evaluation mode +model.eval() + +# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag +model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True) + +# Creating the trace +traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) +torch.jit.save(traced_model, "traced_bert.pt") +``` + +### تحميل نموذج + +يمكنك الآن تحميل `BertModel` المُحفظ سابقًا، `traced_bert.pt`، من القرص واستخدامه على `dummy_input` المُهيأ سابقًا: + +```python +loaded_model = torch.jit.load("traced_bert.pt") +loaded_model.eval() + +all_encoder_layers, pooled_output = loaded_model(*dummy_input) +``` + +### استخدام نموذج مُتتبع للاستدلال + +استخدم النموذج المُتتبع للاستدلال باستخدام أسلوب `__call__` الخاص به: + +```python +traced_model(tokens_tensor, segments_tensors) +``` + +## نشر نماذج Hugging Face TorchScript على AWS باستخدام Neuron SDK + +قدمت AWS عائلة [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) من اﻷجهزة لخفض التكلفة وأداء التعلم الآلي عالي الأداء في البيئة السحابية. تعمل أجهزة Inf1 بواسطة شريحة Inferentia من AWS، وهي مُسرّع أجهزة مُخصص، متخصص في أعباء عمل الاستدلال للتعلم العميق. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) هي SDK لـ Inferentia التي تدعم تتبع نماذج المحولات وتحسينها للنشر على Inf1. توفر Neuron SDK ما يلي: + +1. واجهة برمجة تطبيقات سهلة الاستخدام مع تغيير سطر واحد من التعليمات البرمجية لتتبع نموذج TorchScript وتحسينه للاستدلال في البيئة السحابية. +2. تحسينات الأداء الجاهزة للاستخدام [تحسين التكلفة والأداء](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>). +3. دعم نماذج Hugging Face المحولات المبنية باستخدام إما [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) أو [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html). + +### الآثار المترتبة + +تعمل نماذج المحولات المستندة إلى بنية [BERT (تمثيلات الترميز ثنائية الاتجاه من المحولات)](https://huggingface.co/docs/transformers/main/model_doc/bert) أو متغيراتها مثل [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) و [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) بشكل أفضل على Inf1 للمهام غير التوليدية مثل الإجابة على الأسئلة الاستخراجية، وتصنيف التسلسلات، وتصنيف الرموز (tokens). ومع ذلك، يمكن تكييف مهام توليد النصوص للعمل على Inf1 وفقًا لهذا [برنامج تعليمي AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). 
يمكن العثور على مزيد من المعلومات حول النماذج التي يمكن تحويلها جاهزة على Inferentia في قسم [ملاءمة بنية النموذج](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) من وثائق Neuron. + +### التبعيات (Dependencies) + +يتطلب استخدام AWS Neuron لتحويل النماذج [بيئة SDK Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) والتي تأتي مسبقًا على [AMI للتعلم العميق من AWS](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html). + +### تحويل نموذج لـ AWS Neuron + +قم بتحويل نموذج لـ AWS NEURON باستخدام نفس التعليمات البرمجية من [استخدام TorchScript في Python](torchscript#using-torchscript-in-python) لتتبع `BertModel`. قم باستيراد امتداد إطار عمل `torch.neuron` للوصول إلى مكونات Neuron SDK من خلال واجهة برمجة تطبيقات Python: + +```python +from transformers import BertModel, BertTokenizer, BertConfig +import torch +import torch.neuron +``` + +كل ما عليك فعله هو تعديل السطر التالي: + +```diff +- torch.jit.trace(model, [tokens_tensor, segments_tensors]) ++ torch.neuron.trace(model, [token_tensor, segments_tensors]) +``` + +يتيح ذلك لـ Neuron SDK تتبع النموذج وتحسينه لمثيلات Inf1. + +لمعرفة المزيد حول ميزات AWS Neuron SDK والأدوات ودروس البرامج التعليمية والتحديثات الأخيرة، يرجى الاطلاع على [وثائق AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html). diff --git a/docs/source/ar/trainer.md b/docs/source/ar/trainer.md new file mode 100644 index 00000000000000..7da7cbf4e1714b --- /dev/null +++ b/docs/source/ar/trainer.md @@ -0,0 +1,720 @@ +# Trainer + +تُتيح وحدة [`Trainer`] حلقة تدريب وتقييم متكاملة لنماذج PyTorch المطبقة في مكتبة Transformers. تحتاج فقط إلى تمرير المكونات الضرورية للتدريب (النموذج، والمجزىء النصى، ومجموعة البيانات، دالة التقييم، معلمات التدريب الفائقة، إلخ)، وستتولى فئة [`Trainer`] الباقي. هذا يُسهّل بدء التدريب بشكل أسرع دون كتابة حلقة التدريب الخاصة بك يدويًا. ولكن في الوقت نفسه، فإن [`Trainer`] قابل للتخصيص بدرجة كبيرة ويوفر العديد من خيارات التدريب حتى تتمكن من تخصيصه وفقًا لاحتياجات التدريب الخاصة بك بدقة. + + + +بالإضافة إلى فئة [`Trainer`], توفر مكتبة Transformers أيضًا فئة [`Seq2SeqTrainer`] للمهام التسلسلية مثل الترجمة أو التلخيص. هناك أيضًا فئة [`~trl.SFTTrainer`] من مكتبة [TRL](https://hf.co/docs/trl) التي تغلّف فئة [`Trainer`] وهي مُحُسَّنة لتدريب نماذج اللغة مثل Llama-2 وMistral باستخدام تقنيات التوليد اللغوي. كما يدعم [`~trl.SFTTrainer`] ميزات مثل حزم التسلسلات، وLoRA، والقياس الكمي، وDeepSpeed مما يُمكّن من التدريب بكفاءة على نماذج ضخمة الحجم. + +
+ +لا تتردد في الاطلاع على [مرجع API](./main_classes/trainer) لهذه الفئات الأخرى من النوع [`Trainer`] لمعرفة المزيد حول متى يُستخدم كل منها. بشكل عام، يُعد [`Trainer`] الخيار الأكثر تنوعًا والأنسب لمجموعة واسعة من المهام، بينما صُمم [`Seq2SeqTrainer`] للمهام التسلسلية، وصُمم [`~trl.SFTTrainer`] لتدريب نماذج اللغة الكبيرة. + 
+ +قبل البدء، تأكد من تثبيت مكتبة [Accelerate](https://hf.co/docs/accelerate) - وهي مكتبة تُمكّن تشغيل تدريب PyTorch في بيئات مُوزعة. + +```bash +pip install accelerate + +# upgrade +pip install accelerate --upgrade +``` + +يوفر هذا الدليل نظرة عامة على فئة [`Trainer`]. + +## الاستخدام الأساسي + +يتضمن [`Trainer`] جميع التعليمات البرمجية التي ستجدها في حلقة التدريب الأساسية: + +1. قم بتنفيذ خطوة تدريب لحساب الخسارة +2. احسب المشتقات باستخدام طريقة [`~accelerate.Accelerator.backward`] +3. تحديث الأوزان بناءً على المشتقات +4. كرر هذه العملية حتى تصل إلى عدد محدد مسبقًا من الدورات (epochs). + +تُجرد فئة [`Trainer`] كل هذه التعليمات البرمجية حتى لا تضطر إلى القلق بشأن كتابة حلقة تدريب يدويًا في كل مرة أما إذا كنت بدأت للتو في PyTorch والتدريب. كل ما عليك فعله هو توفير المكونات الأساسية اللازمة للتدريب، مثل النموذج ومجموعة بيانات، وتتعامل فئة [`Trainer`] مع كل شيء آخر. + +إذا كنت تُريد تحديد أي خيارات تدريب أو معلمات فائقة، فيمكنك العثور عليها في فئة [`TrainingArguments`]. على سبيل المثال، دعنا نحدد أين يتم حفظ النموذج في `output_dir` ورفع النموذج إلى Hub بعد التدريب باستخدام `push_to_hub=True`. + +```py +from transformers import TrainingArguments + +training_args = TrainingArguments( + output_dir="your-model"، + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + weight_decay=0.01, + eval_strategy="epoch"، + save_strategy="epoch"، + load_best_model_at_end=True, + push_to_hub=True, +) +``` +مرر `training_args` إلى [`Trainer`] جنبًا إلى جنب مع النموذج، ومجموعة بيانات، وشئ لمعالجة مجموعة البيانات مسبقًا (حسب نوع البيانات، فقد يكون محللًا رمزيًا أو مستخرج ميزات أو معالج صور)، وجامع بيانات، ودالة لحساب المقاييس التي تُريد تتبعها أثناء التدريب. + +أخيرًا، استدعِ [`~Trainer.train`] لبدء التدريب! + +```py +from transformers import Trainer + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"]، + eval_dataset=dataset["test"]، + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, +) + +trainer.train() +``` + +### نقاط الحفظ + +تحفظ فئة [`Trainer`] نقاط الحفظ النموذج في الدليل المحدد في معامل `output_dir` من [`TrainingArguments`]. ستجد نقاط الحفظ في مجلد فرعي يسمى `checkpoint-000` حيث تتوافق الأرقام في النهاية مع خطوة التدريب. إن حفظ نقاط الحفظ مفيد لاستئناف التدريب لاحقًا. + +```py +# استأنف من أحدث نقطة حفظ +trainer.train(resume_from_checkpoint=True) + +# استأنف من نقطة حفظ محددة محفوظة في دليل الإخراج +trainer.train(resume_from_checkpoint="your-model/checkpoint-1000") +``` + +يمكنك حفظ نقاط الحفظ الخاصة بك (لا يتم حفظ حالة المُجزىء اللغوى تقائيًا) إلى Hub عن طريق تعيين `push_to_hub=True` في [`TrainingArguments`] لرفعها. الخيارات الأخرى لاتخاذ القرار بشأن كيفية حفظ هذة النقاط الخاصة بك هي الإعداد في معامل [`hub_strategy`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.hub_strategy): + +* `hub_strategy="checkpoint"` يدفع أحدث نقطة حفظ إلى مجلد فرعي يسمى "last-checkpoint" يمكنك استئناف التدريب منه +* `hub_strategy="all_checkpoints"` يدفع جميع نقاط الحفظ إلى الدليل المحدد في `output_dir` (سترى نقطة حفظ واحدة لكل مجلد في مستودع النموذج الخاص بك) + +عند استئناف التدريب من نقطة حفظ، تُحاول [`Trainer`] الحفاظ على حالات RNG Python وNumPy وPyTorch كما كانت عندما تم حفظ نقطة الحفظ. ولكن لأن PyTorch لديها العديد من الإعدادات الافتراضية غير الحتمية مُتنوعة، فإن حالات RNG ليست مضمونة لتكون هي نفسها. 
إذا كنت تريد تمكين الحتمية الكاملة، فراجع دليل [التحكم في مصادر العشوائية](https://pytorch.org/docs/stable/notes/randomness#controlling-sources-of-randomness) لمعرفة ما يُمكنك تمكينه لجعل تدريبك حتميًا تمامًا. ضع في اعتبارك أنه من خلال جعل إعدادات معينة حتمية، فقد يكون التدريب أبطأ. + +## تخصيص المدرب + +في حين أن فئة [`Trainer`] مُصممة لتكون سهلة الوصول وسهلة الاستخدام، فإنها توفر أيضًا الكثير من قابلية التخصيص للمستخدمين المغامرين. يُمكن إنشاء فئات فرعية من العديد من أساليب [`Trainer`] وتجاوزها لدعم الوظائف التي تُريدها، دون الحاجة إلى إعادة كتابة حلقة التدريب بأكملها من البداية لاستيعابها. تتضمن هذه الأساليب: + +* [`~Trainer.get_train_dataloader`] ينشئ DataLoader للتدريب +* [`~Trainer.get_eval_dataloader`] ينشئ DataLoader للتقييم +* [`~Trainer.get_test_dataloader`] ينشئ DataLoader للاختبار +* [`~Trainer.log`] يسجل معلومات حول مختلف الكائنات التي تراقب التدريب +* [`~Trainer.create_optimizer_and_scheduler`] ينشئ محسنًا ومخططًا لمُعدل التعلم إذا لم يتم تمريرهما في `__init__`؛ يمكن أيضًا تخصيص هذه الوظائف بشكل منفصل باستخدام [`~Trainer.create_optimizer`] و [`~Trainer.create_scheduler`] على التوالي +* [`~Trainer.compute_loss`] يحسب دالة الخسارة على دفعة من مُدخلات التدريب +* [`~Trainer.training_step`] يُنفذ خطوة التدريب +* [`~Trainer.prediction_step`] يُنفذ خطوة التنبؤ والاختبار +* [`~Trainer.evaluate`] يُقيّم النموذج ويعيد مقاييس التقييم +* [`~Trainer.predict`] يُجري التنبؤات (مع المقاييس إذا كانت العلامات متاحة) على مجموعة الاختبار + +على سبيل المثال، إذا كنت تريد تخصيص طريقة [`~Trainer.compute_loss`] لاستخدام دالة خسارة ذات ترجيح بدلاً من ذلك. + + +```py +from torch import nn +from transformers import Trainer + +class CustomTrainer(Trainer): + def compute_loss(self, model, inputs, return_outputs=False): + labels = inputs.pop("labels") + # forward pass + outputs = model(**inputs) + logits = outputs.get("logits") + # compute custom loss for 3 labels with different weights + loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device)) + loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)) + return (loss, outputs) if return_outputs else loss +``` + +### دوال الاستدعاء Callbacks + +خيار آخر لتخصيص [`Trainer`] هو استخدام [دوال الاستدعاء](callbacks). لا *تغير* دوال الاستدعاء أي شيء في حلقة التدريب. إنهم تفحص حالة حلقة التدريب ثم تُنفذ بعض الإجراءات (مثل الإيقاف المبكر أو تسجيل النتائج، إلخ) اعتمادًا على الحالة. وبعبارة أخرى، لا يمكن استخدام دالة الاستدعاء لتنفيذ شيء مثل دالة خسارة مخصصة، ويجب عليك تجاوز دالة [`~Trainer.compute_loss`] لذلك. + +على سبيل المثال، إذا كنت تريد إضافة دالة استدعاء إيقاف مبكر إلى حلقة التدريب بعد 10 خطوات. + +```py +from transformers import TrainerCallback + +class EarlyStoppingCallback(TrainerCallback): + def __init__(self, num_steps=10): + self.num_steps = num_steps + + def on_step_end(self, args, state, control, **kwargs): + if state.global_step >= self.num_steps: + return {"should_training_stop": True} + else: + return {} +``` + +ثم مرره إلى معامل `callback` في [`Trainer`]. + +```py +from transformers import Trainer + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"]، + eval_dataset=dataset["test"]، + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + callback=[EarlyStoppingCallback()], +) +``` + +## تسجيل الأحداث (Logging) + + + +راجع مرجع [API](./main_classes/logging) للتسجيل للحصول على مزيد من المعلومات حول مستويات التسجيل المختلفة للأحداث. 
+ + + +يتم تعيين [`Trainer`] إلى `logging.INFO` افتراضيًا والذي يُبلغ عن الأخطاء والتحذيرات ومعلومات أساسية أخرى. يتم تعيين نسخة [`Trainer`] - في البيئات الموزعة - إلى `logging.WARNING` والتي يُبلغ فقط عن الأخطاء والتحذيرات. يمكنك تغيير مستوى تسجيل الأحداث باستخدام معاملي [`log_level`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.log_level) و [`log_level_replica`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.log_level_replica) في [`TrainingArguments`]. + +لتهيئة إعداد مُستوى تسجيل اﻷحداث لكل عقدة، استخدم معامل [`log_on_each_node`](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments.log_on_each_node) لتحديد ما إذا كان سيتم استخدام مُستوى السجل على كل عقدة أو فقط على العقدة الرئيسية. + + + +يحدد [`Trainer`] مُستوى التسجيل بشكل مُنفصل لكل عقدة في طريقة [`Trainer.__init__`]، لذا فقد ترغب في التفكير في تعيين هذا الإعداد في وقت سابق إذا كنت تستخدم وظائف Transformers الأخرى قبل إنشاء كائن [`Trainer`]. + + + +على سبيل المثال، لتعيين التعليمات البرمجية والوحدات النمطية الرئيسية الخاصة بك لاستخدام نفس مُستوى التسجيل وفقًا لكل عقدة: + +```py +logger = logging.getLogger(__name__) + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s"، + datefmt="%m/%d/%Y %H:%M:%S"، + handlers=[logging.StreamHandler(sys.stdout)], +) + +log_level = training_args.get_process_log_level() +logger.setLevel(log_level) +datasets.utils.logging.set_verbosity(log_level) +transformers.utils.logging.set_verbosity(log_level) + +trainer = Trainer(...) +``` + +استخدم تركيبات مختلفة من `log_level` و `log_level_replica` لتهيئة ما يتم تسجيله على كل من العقد. + + + + + +```bash +my_app.py ... --log_level warning --log_level_replica error +``` + + + + +أضف معلمة `log_on_each_node 0` لبيئات متعددة العقد. + +```bash +my_app.py ... --log_level warning --log_level_replica error --log_on_each_node 0 + +# set to only report errors +my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0 +``` + + + + +## NEFTune + +[NEFTune](https://hf.co/papers/2310.05914) هي تقنية يمكن أن تحسن الأداء عن طريق إضافة ضوضاء إلى مُتجهات التعلم أثناء التدريب. لتمكينه في [`Trainer`], قم بتعيين معامل `neftune_noise_alpha` في [`TrainingArguments`] للتحكم في مقدار الضوضاء المُضافة. + +```py +from transformers import TrainingArguments, Trainer + +training_args = TrainingArguments(..., neftune_noise_alpha=0.1) +trainer = Trainer(..., args=training_args) +``` + +يتم تعطيل NEFTune بعد التدريب لاستعادة طبقة التعلم الأصلية لتجنب أي سلوك غير متوقع. + +## نواة Liger +[Liger-Kernel](https://github.com/linkedin/Liger-Kernel) Kernel هي مجموعة من نوى Triton التي طورتها Linkedin مُصممة خصيصًا لتدريب نماذج اللغة الكبيرة (LLM). لقد قمنا بتنفيذ RMSNorm و RoPE و SwiGLU و CrossEntropy و FusedLinearCrossEntropy مُتوافقة مع Hugging Face، والمزيد قادم. يُمكنها زيادة إنتاجية التدريب متعدد وحدات معالجة الرسومات (GPU) بنسبة 20٪ وتقليل استخدام الذاكرة بنسبة 60٪. تعمل النواة بشكل تلقائي مع flash attention و PyTorch FSDP و Microsoft DeepSpeed. + +احصل على زيادة في الإنتاجية بنسبة 20٪ وتقليل استخدام الذاكرة بنسبة 60٪ على تدريب نماذج LLaMA 3-8B. حقق أطوال سياق أكبر وأحجام دفعات أكبر. كما أنها مُفيدة إذا كنت تُريد زيادة حجم نموذجك إلى تدريب بنماذج متعددة الرؤوس أو أحجام مُفردات ضخمة. أطلق العنان للتدريب بنماذج متعددة الرؤوس (medusa) والمزيد. 
راجع التفاصيل والأمثلة في [Liger](https://github.com/linkedin/Liger-Kernel/tree/main/examples) +تأكد أولاً من تثبيت مستودع Liger الرسمي: +```bash +pip install liger-kernel +``` +يجب عليك تمرير `use_liger_kernel=True` لتطبيق نواة `liger` على نموذجك، على سبيل المثال: + +```python +from transformers import TrainingArguments + +training_args = TrainingArguments( + output_dir="your-model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + weight_decay=0.01, + eval_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=True, + use_liger_kernel=True +) +``` + +تدعم النواة معماريات نماذج Llama و Gemma و Mistral و Mixtral. يُمكن العثور على أحدث قائمة بالنمائج المدعومة [هنا](https://github.com/linkedin/Liger-Kernel). عندما يتم تعيين `use_liger_kernel` إلى `True`، سيتم تصحيح الطبقات المُقابلة في النموذج الأصلي باستخدام تطبيق Liger الفعال، لذلك لا تحتاج إلى فعل أي شيء إضافي بخلاف تعيين قيمة المعامل. + +## المُحسِّنات +يمكنك اختيار مُحسِّن مدمج للتدريب باستخدام: +```python +from transformers import TrainingArguments +training_args = TrainingArguments(..., optim="adamw_torch") +``` +اطلع على [`OptimizerNames`](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py) للاطلاع على القائمة الكاملة للخيارات. نُدرج أمثلة مُتقدمة في الأقسام أدناه. + +يمكنك أيضًا استخدام مُحسِّن PyTorch عشوائي عبر: +```python +import torch + +optimizer_cls = torch.optim.AdamW +optimizer_kwargs = { + "lr": 4e-3, + "betas": (0.9, 0.999), + "weight_decay": 0.05, +} + +from transformers import Trainer +trainer = Trainer(..., optimizer_cls_and_kwargs=(optimizer_cls, optimizer_kwargs)) +``` + + + + +### GaLore + +إسقاط التدرج ذو الرتبة المنخفضة (GaLore) هو إستراتيجية تدريب ذات رتبة منخفضة فعّالة من حيث الذاكرة، تسمح بتعلم المعلمات الكاملة ولكنها أكثر كفاءة من حيث الذاكرة من أساليب التكيّف الشائعة ذات الرتبة المنخفضة، مثل LoRA. + +أولاً، تأكد من تثبيت المستودع الرسمي لـ GaLore: + +```bash +pip install galore-torch +``` + +ثم أضف ببساطة أحد `["galore_adamw"، "galore_adafactor"، "galore_adamw_8bit"]` في `optim` جنبًا إلى جنب مع `optim_target_modules`، والتي يمكن أن تكون قائمة من السلاسل أو التعبيرات النمطية regex أو المسار الكامل المطابق لأسماء الوحدات المستهدفة التي تريد تكييفها. 
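على سبيل المثال، قد يبدو إعداد مُبسّط يقتصر على هذين المعاملين كما يلي (هذا مجرد مخطط توضيحي، وأنماط الوحدات المستهدفة هنا افتراضية وتعتمد على بنية النموذج الذي تستخدمه):

```python
from transformers import TrainingArguments

# مخطط توضيحي: تفعيل GaLore على الوحدات التي تطابق أنماط regex أدناه
args = TrainingArguments(
    output_dir="./galore-sketch",  # مسار إخراج افتراضي للتوضيح فقط
    optim="galore_adamw",
    optim_target_modules=[r".*.attn.*", r".*.mlp.*"],
)
```
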
فيما يلي مثال على النص البرمجي كامل(تأكد من `pip install trl datasets`): + +```python +import torch +import datasets +import trl + +from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM + +train_dataset = datasets.load_dataset('imdb', split='train') + +args = TrainingArguments( + output_dir="./test-galore"، + max_steps=100, + per_device_train_batch_size=2, + optim="galore_adamw"، + optim_target_modules=[r".*.attn.*"، r".*.mlp.*"] +) + +model_id = "google/gemma-2b" + +config = AutoConfig.from_pretrained(model_id) + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_config(config).to(0) + +trainer = trl.SFTTrainer( + model=model, + args=args, + train_dataset=train_dataset, + dataset_text_field='text', + max_seq_length=512, +) + +trainer.train() +``` + +لتمرير معامﻻت إضافية يدعمها GaLore، يجب عليك تمرير `optim_args` بشكل صحيح، على سبيل المثال: + +```python +import torch +import datasets +import trl + +from transformers import TrainingArguments, AutoConfig, AutoTokenizer, AutoModelForCausalLM + +train_dataset = datasets.load_dataset('imdb', split='train') + +args = TrainingArguments( + output_dir="./test-galore", + max_steps=100, + per_device_train_batch_size=2, + optim="galore_adamw", + optim_target_modules=[r".*.attn.*", r".*.mlp.*"], + optim_args="rank=64, update_proj_gap=100, scale=0.10", +) + +model_id = "google/gemma-2b" + +config = AutoConfig.from_pretrained(model_id) + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_config(config).to(0) + +trainer = trl.SFTTrainer( + model=model, + args=args, + train_dataset=train_dataset, + dataset_text_field='text', + max_seq_length=512, +) + +trainer.train() +``` +يمكنك قراءة المزيد حول الطريقة في [المستودع الأصلي](https://github.com/jiaweizzhao/GaLore) أو [الورقة البحثية](https://arxiv.org/abs/2403.03507). + +حاليًا، يمكنك فقط تدريب الطبقات الخطية التي تعتبر طبقات GaLore وستستخدم التحلل ذو الرتبة المنخفضة للتدريب بينما سيتم تحسين الطبقات المتبقية بالطريقة التقليدية. + +لاحظ أنه سيستغرق الأمر بعض الوقت قبل بدء التدريب (~3 دقائق لنموذج 2B على NVIDIA A100)، ولكن يجب أن يسير التدريب بسلاسة بعد ذلك. + +يمكنك أيضًا إجراء تحسين طبقة تلو الأخرى عن طريق إضافة `layerwise` إلى اسم المُحسِّن كما هو موضح أدناه: + +```python +import torch +import datasets +import trl + +from transformers import TrainingArguments، AutoConfig، AutoTokenizer، AutoModelForCausalLM + +train_dataset = datasets.load_dataset('imdb'، split='train') + +args = TrainingArguments( + output_dir="./test-galore"، + max_steps=100، + per_device_train_batch_size=2، + optim="galore_adamw_layerwise"، + optim_target_modules=[r".*.attn.*"، r".*.mlp.*"] +) + +model_id = "google/gemma-2b" + +config = AutoConfig.from_pretrained(model_id) + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_config(config).to(0) + +trainer = trl.SFTTrainer( + model=model، + args=args، + train_dataset=train_dataset، + dataset_text_field='text'، + max_seq_length=512، +) + +trainer.train() +``` + +لاحظ أن تحسين الطبقة تجريبي إلى حد ما ولا يدعم DDP (Distributed Data Parallel)، وبالتالي يمكنك تشغيل التعليمات البرمجية للتدريب على وحدة معالجة الرسومات (GPU) واحدة فقط. يرجى الاطلاع على [هذا القسم المناسب](https://github.com/jiaweizzhao/GaLore?tab=readme-ov-file#train-7b-model-with-a-single-gpu-with-24gb-memory) لمزيد من التفاصيل. قد لا تدعم الميزات الأخرى مثل تقليم التدرجات أو DeepSpeed، إلخ. من الصندوق. 
يرجى [تقديم تقرير عن المشكلة على GitHub](https://github.com/huggingface/transformers/issues) إذا واجهتك مثل هذه المشكلة. + +### محسنات LOMO + +تم تقديم مُحسِّنات LOMO في [التدريب على المعلمات الكاملة لنماذج اللغة الكبيرة باستخدام موارد محدودة](https://hf.co/papers/2306.09782) و [AdaLomo: تحسين ذاكرة منخفضة بمعدل تعلم متكيف](https://hf.co/papers/2310.10195). +يتكون كلاهما من طريقة فعالة لضبط المعلمات الكاملة. تدمج محسنات LOMO حساب الاشتقاق وتحديث المعلمات في خطوة واحدة لتقليل استخدام الذاكرة. محسنات LOMO المدعومة هي `"lomo"` و `"adalomo"`. أولاً قم بتثبيت LOMO من pypi `pip install lomo-optim` أو قم بتثبيته من المصدر باستخدام `pip install git+https://github.com/OpenLMLab/LOMO.git`. + + + +وفقًا للمؤلفين، يوصى باستخدام `AdaLomo` بدون `grad_norm` للحصول على أداء أفضل وسرعة أعلى. + + + +فيما يلي نص برمجي بسيط يوضح كيفية ضبط نموذج [google/gemma-2b](https://huggingface.co/google/gemma-2b) على مجموعة بيانات IMDB في الدقة الكاملة: + +```python +import torch +import datasets +from transformers import TrainingArguments، AutoTokenizer، AutoModelForCausalLM +import trl + +train_dataset = datasets.load_dataset('imdb'، split='train') + +args = TrainingArguments( + output_dir="./test-lomo"، + max_steps=100، + per_device_train_batch_size=4، + optim="adalomo"، + gradient_checkpointing=True، + logging_strategy="steps"، + logging_steps=1، + learning_rate=2e-6، + save_strategy="no"، + run_name="lomo-imdb"، +) + +model_id = "google/gemma-2b" + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id، low_cpu_mem_usage=True).to(0) + +trainer = trl.SFTTrainer( + model=model، + args=args، + train_dataset=train_dataset، + dataset_text_field='text'، + max_seq_length=1024، +) + +trainer.train() +``` + +### مُحسِّن GrokAdamW +تم تصميم مُحسِّن GrokAdamW لتعزيز أداء التدريب واستقراره، خاصةً للنماذج التي تستفيد من دوال إشارة `grokking`. لاستخدام `GrokAdamW`، قم أولاً بتثبيت حزمة المُحسِّن باستخدام `pip install grokadamw`. + +يُعد GrokAdamW مفيدًا بشكل خاص للنماذج التي تتطلب تقنيات تحسين مُتقدمة لتحقيق أداء واستقرار أفضل. + + +فيما يلي نص برمجى بسيط لشرح كيفية ضبط [google/gemma-2b](https://huggingface.co/google/gemma-2b) بدقة على مجموعة بيانات IMDB باستخدام مُحسِّن GrokAdamW: +```python +import torch +import datasets +from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM, Trainer + +# تحميل مجموعة البيانات IMDB +train_dataset = datasets.load_dataset('imdb', split='train') + +# تعريف معامﻻت التدريب +args = TrainingArguments( + output_dir="./test-grokadamw", + max_steps=1000, + per_device_train_batch_size=4, + optim="grokadamw", + logging_strategy="steps", + logging_steps=1, + learning_rate=2e-5, + save_strategy="no", + run_name="grokadamw-imdb", +) + +# تحميل النموذج والمجزىء اللغوي +model_id = "google/gemma-2b" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0) + +# تهيئة المدرب +trainer = Trainer( + model=model, + args=args, + train_dataset=train_dataset, +) + +# تدريب النموذج +trainer.train() +``` +يوضح هذا النص البرمجى كيفية ضبط نموذج google/gemma-2b بدقة على مجموعة بيانات IMDB باستخدام مُحسِّن GrokAdamW. يتم تكوين TrainingArguments لاستخدام GrokAdamW، ويتم تمرير مجموعة البيانات إلى Trainer للتدريب. + +### مُحسِّن بدون جدوله (Schedule Free Optimizer) +تم تقديم مُحسِّنات بدون جدوله في [The Road Less Scheduled](https://hf.co/papers/2405.15682). 
+يستبدل التعلم بدون جدوله زخم المُحسِّن الأساسي بمزيج من المتوسط ​​والتداخل، لإزالة الحاجة تمامًا إلى تخفيف مُعدل التعلم باستخدام جدوله تقليديه. +المُحسِّنات المدعومة لـ SFO هي "schedule_free_adamw" و "schedule_free_sgd". قم أولاً بتثبيت `schedulefree` من pypi باستخدام الأمر `pip install schedulefree`. + +فيما يلي نص برمجى بسيط لشرح كيفية ضبط [google/gemma-2b](https://huggingface.co/google/gemma-2b) بدقة على مجموعة بيانات IMDB بدقة كاملة: +```python +import torch +import datasets +from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM +import trl + +train_dataset = datasets.load_dataset('imdb', split='train') + +args = TrainingArguments( + output_dir="./test-schedulefree", + max_steps=1000, + per_device_train_batch_size=4, + optim="schedule_free_adamw", + gradient_checkpointing=True, + logging_strategy="steps", + logging_steps=1, + learning_rate=2e-6, + save_strategy="no", + run_name="sfo-imdb", +) + +model_id = "google/gemma-2b" + +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0) + +trainer = trl.SFTTrainer( + model=model, + args=args, + train_dataset=train_dataset, + dataset_text_field='text', + max_seq_length=1024, +) + +trainer.train() +``` +## تسريع ومدرب + +يتم تشغيل فئة [`Trainer`] بواسطة [تسريع](https://hf.co/docs/accelerate)، وهي مكتبة لتدريب نماذج PyTorch بسهولة في بيئات موزعة مع دعم عمليات التكامل مثل [FullyShardedDataParallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) و [DeepSpeed](https://www.deepspeed.ai/). + + + +تعرف على المزيد حول استراتيجيات تجزئة FSDP، وتفريغ وحدة المعالجة المركزية (CPU)، والمزيد مع [`Trainer`] في [دليل Fully Sharded Data Parallel](fsdp). + + + +لاستخدام Accelerate مع [`Trainer`]]، قم بتشغيل الأمر [`accelerate.config`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-config) لإعداد التدريب لبيئة التدريب الخاصة بك. نشئ هذا الأمر ملف `config_file.yaml` الذي سيتم استخدامه عند تشغيل نص للتدريب البرمجى. 
على سبيل المثال، بعض تكوينات المثال التي يمكنك إعدادها هي: + + + + +```yml +compute_environment: LOCAL_MACHINE +distributed_type: MULTI_GPU +downcast_bf16: 'no' +gpu_ids: all +machine_rank: 0 #change rank as per the node +main_process_ip: 192.168.20.1 +main_process_port: 9898 +main_training_function: main +mixed_precision: fp16 +num_machines: 2 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + + + + +```yml +compute_environment: LOCAL_MACHINE +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch_policy: BACKWARD_PRE + fsdp_forward_prefetch: true + fsdp_offload_params: false + fsdp_sharding_strategy: 1 + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_sync_module_states: true + fsdp_transformer_layer_cls_to_wrap: BertLayer + fsdp_use_orig_params: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + + + + +```yml +compute_environment: LOCAL_MACHINE +deepspeed_config: + deepspeed_config_file: /home/user/configs/ds_zero3_config.json + zero3_init_flag: true +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + + + + +```yml +compute_environment: LOCAL_MACHINE +deepspeed_config: + gradient_accumulation_steps: 1 + gradient_clipping: 0.7 + offload_optimizer_device: cpu + offload_param_device: cpu + zero3_init_flag: true + zero_stage: 2 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + + + +يُعد أمر [`accelerate_launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) هو الطريقة المُوصى بها لتشغيل نص البرمجى للتدريب على نظام موزع باستخدام Accelerate و [`Trainer`] مع المعلمات المحددة في `config_file.yaml`. يتم حفظ هذا الملف في مجلد ذاكرة التخزين المؤقت لـ Accelerate ويتم تحميله تلقائيًا عند تشغيل `accelerate_launch`. 
+ +على سبيل المثال، لتشغيل النص البرنامجي للتدريب [run_glue.py](https://github.com/huggingface/transformers/blob/f4db565b695582891e43a5e042e5d318e28f20b8/examples/pytorch/text-classification/run_glue.py#L4) مع تكوين FSDP: + +```bash +accelerate launch \ + ./examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path google-bert/bert-base-cased \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 16 \ + --learning_rate 5e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir +``` + +يمكنك أيضًا تحديد المعلمات من ملف `config_file.yaml` مباشرة في سطر الأوامر: + +```bash +accelerate launch --num_processes=2 \ + --use_fsdp \ + --mixed_precision=bf16 \ + --fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \ + --fsdp_transformer_layer_cls_to_wrap="BertLayer" \ + --fsdp_sharding_strategy=1 \ + --fsdp_state_dict_type=FULL_STATE_DICT \ + ./examples/pytorch/text-classification/run_glue.py + --model_name_or_path google-bert/bert-base-cased \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 16 \ + --learning_rate 5e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir +``` + +اطلع على برنامج تعليمي [Launching your Accelerate scripts](https://huggingface.co/docs/accelerate/basic_tutorials/launch) لمعرفة المزيد حول `accelerate_launch` والتكوينات المخصصة. diff --git a/docs/source/ar/troubleshooting.md b/docs/source/ar/troubleshooting.md new file mode 100644 index 00000000000000..7874a9fad13304 --- /dev/null +++ b/docs/source/ar/troubleshooting.md @@ -0,0 +1,171 @@ +# استكشاف الأخطاء وإصلاحها + +تحدث الأخطاء أحيانًا، لكننا هنا للمساعدة! يغطي هذا الدليل بعض المشكلات الأكثر شيوعًا التي واجهناها وكيفية حلها. مع ذلك، لا يُقصد بهذا الدليل أن يكون مجموعة شاملة لكل مشكلات 🤗 Transformers. لمزيد من المساعدة في استكشاف مشكلتك وإصلاحها، جرب ما يلي: + + + +1. اطلب المساعدة على [المنتديات](https://discuss.huggingface.co/). هناك فئات محددة يمكنك نشر سؤالك فيها، مثل [المبتدئين](https://discuss.huggingface.co/c/beginners/5) أو [🤗 Transformers](https://discuss.huggingface.co/c/transformers/9). تأكد من كتابة منشور جيد وواضح على المنتدى مع بعض التعليمات البرمجية القابلة للتكرار لزيادة احتمالية حل مشكلتك! + + +2. قم بإنشاء [مشكلة](https://github.com/huggingface/transformers/issues/new/choose) في مستودع 🤗 Transformers إذا كانت هناك مشكلة متعلقة بالمكتبة. حاول تضمين أكبر قدر ممكن من المعلومات التي تصف المشكلة لمساعدتنا في معرفة ما هو الخطأ وكيفية إصلاحه. + +3. تحقق من دليل [الترحيل](migration) إذا كنت تستخدم إصدارًا أقدم من مكتبة 🤗 Transformers حيث تم إدخال بعض التغييرات المهمة بين الإصدارات. + + +للحصول على مزيد من التفاصيل حول استكشاف الأخطاء وإصلاحها والحصول على المساعدة، راجع [الفصل 8](https://huggingface.co/course/chapter8/1?fw=pt) من دورة Hugging Face. + +## بيئات جدار الحماية + +بعض وحدات معالجة الرسومات (GPU) على السحابة وإعدادات الشبكة الداخلية محمية بجدار حماية من الاتصالات الخارجية، مما يؤدي إلى حدوث خطأ في الاتصال. عندما تحاول تعليمات البرنامج النصي تنزيل أوزان النموذج أو مجموعات البيانات، سيتوقف التنزيل ثم ينتهي بخطأ مثل: + +``` +ValueError: Connection error, and we cannot find the requested files in the cached path. +Please try again or make sure your Internet connection is on. +``` + +في هذه الحالة، يجب محاولة تشغيل 🤗 Transformers في [وضع عدم الاتصال](installation#offline-mode) لتجنب خطأ الاتصال. 
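على سبيل المثال، يُمكن عادةً تفعيل وضع عدم الاتصال عن طريق تعيين متغيرات البيئة قبل استيراد المكتبة، بافتراض أن النموذج قد تم تنزيله وتخزينه مؤقتًا مسبقًا (المثال التالي مجرد توضيح):

```python
>>> import os

>>> # يجب تعيين هذه المتغيرات قبل استيراد transformers حتى يتم أخذها في الاعتبار
>>> os.environ["TRANSFORMERS_OFFLINE"] = "1"
>>> os.environ["HF_DATASETS_OFFLINE"] = "1"

>>> from transformers import AutoModel

>>> # سيتم الآن تحميل النموذج من ذاكرة التخزين المؤقت المحلية فقط
>>> model = AutoModel.from_pretrained("google-bert/bert-base-uncased")
```
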
+ +## CUDA نفاد الذاكرة + +يمكن أن يكون تدريب النماذج الكبيرة التي تحتوي على ملايين المعلمات أمرًا صعبًا بدون الأجهزة المناسبة. أحد الأخطاء الشائعة التي قد تواجهها عند نفاد ذاكرة GPU هو: + +``` +CUDA out of memory. Tried to allocate 256.00 MiB (GPU 0; 11.17 GiB total capacity; 9.70 GiB already allocated; 179.81 MiB free; 9.85 GiB reserved in total by PyTorch) +``` + +فيما يلي بعض الحلول المحتملة التي يمكنك تجربتها لتقليل استخدام الذاكرة: + +- قلل من قيمة [`per_device_train_batch_size`](main_classes/trainer#transformers.TrainingArguments.per_device_train_batch_size) في [`TrainingArguments`]. + +- حاول استخدام [`gradient_accumulation_steps`](main_classes/trainer#transformers.TrainingArguments.gradient_accumulation_steps) في [`TrainingArguments`] لزيادة حجم الدُفعة بشكل فعال. + + +راجع دليل [الأداء](performance) لمزيد من التفاصيل حول تقنيات توفير الذاكرة. + + +## عدم القدرة على تحميل نموذج TensorFlow محفوظ + +تقوم طريقة TensorFlow [model.save](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) بحفظ النموذج بالكامل - الهندسة المعمارية، الأوزان، تكوين التدريب - في ملف واحد. ومع ذلك، عند تحميل ملف النموذج مرة أخرى، قد تواجه خطأ لأن مكتبة 🤗 Transformers قد لا تقوم بتحميل جميع الكائنات المتعلقة بـ TensorFlow في ملف النموذج. لتجنب المشكلات المتعلقة بحفظ وتحميل نماذج TensorFlow، نوصي بما يلي: + +- احفظ أوزان النموذج كملف `h5` باستخدام [`model.save_weights`](https://www.tensorflow.org/tutorials/keras/save_and_load#save_the_entire_model) ثم أعد تحميل النموذج باستخدام [`~TFPreTrainedModel.from_pretrained`]: + +```python +>>> from transformers import TFPreTrainedModel +>>> from tensorflow import keras + +>>> model.save_weights("some_folder/tf_model.h5") +>>> model = TFPreTrainedModel.from_pretrained("some_folder") +``` + +- احفظ النموذج باستخدام [`~TFPretrainedModel.save_pretrained`] وقم بتحميله مرة أخرى باستخدام [`~TFPreTrainedModel.from_pretrained`]: + +```python +>>> from transformers import TFPreTrainedModel + +>>> model.save_pretrained("path_to/model") +>>> model = TFPreTrainedModel.from_pretrained("path_to/model") +``` + +## ImportError + +خطأ شائع آخر قد تواجهه، خاصة إذا كان نموذجًا تم إصداره حديثًا، هو `ImportError`: + +``` +ImportError: cannot import name 'ImageGPTImageProcessor' from 'transformers' (unknown location) +``` + +بالنسبة لأنواع الأخطاء هذه، تحقق من أن لديك أحدث إصدار من مكتبة Hugging Face Transformers مثبتًا للوصول إلى أحدث النماذج: + +```bash +pip install transformers --upgrade +``` + +## خطأ CUDA: تم تشغيل التأكيد على جانب الجهاز + +في بعض الأحيان، قد تواجه خطأ CUDA عامًا حول خطأ في كود الجهاز. + +``` +RuntimeError: CUDA error: device-side assert triggered +``` + +يجب عليك محاولة تشغيل الكود على وحدة المعالجة المركزية (CPU) أولاً للحصول على رسالة خطأ أكثر دقة. أضف متغير البيئة التالي في بداية كودك للتبديل إلى وحدة المعالجة المركزية: + +```python +>>> import os + +>>> os.environ["CUDA_VISIBLE_DEVICES"] = "" +``` + +الخيار الآخر هو الحصول على تتبع مكدس أفضل من GPU. أضف متغير البيئة التالي في بداية كودك للحصول على تتبع المكدس للإشارة إلى مصدر الخطأ: + +```python +>>> import os + +>>> os.environ["CUDA_LAUNCH_BLOCKING"] = "1" +``` + +## إخراج غير صحيح عند عدم إخفاء رموز الحشو + +في بعض الحالات، قد يكون `hidden_state` غير صحيحة إذا تضمنت `input_ids` رموز حشو. ولإثبات ذلك، قم بتحميل نموذج ومجزىء لغوى. يمكنك الوصول إلى `pad_token_id` للنموذج لمعرفة قيمته. قد تكون `pad_token_id` `None` لبعض النماذج، ولكن يمكنك دائمًا تعيينها يدويًا. 
+ +```python +>>> from transformers import AutoModelForSequenceClassification +>>> import torch + +>>> model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased") +>>> model.config.pad_token_id +0 +``` + +يوضح المثال التالي المُخرجات بدون إخفاء رموز الحشو: + +```python +>>> input_ids = torch.tensor([[7592, 2057, 2097, 2393, 9611, 2115], [7592, 0, 0, 0, 0, 0]]) +>>> output = model(input_ids) +>>> print(output.logits) +tensor([[ 0.0082, -0.2307], +[ 0.1317, -0.1683]], grad_fn=) +``` + +هنا المُخرجات الفعلية للتسلسل الثاني: + +```python +>>> input_ids = torch.tensor([[7592]]) +>>> output = model(input_ids) +>>> print(output.logits) +tensor([[-0.1008, -0.4061]], grad_fn=) +``` + +يجب عليك في معظم الوقت توفير `attention_mask` للنموذج لتجاهل رموز الحشو لتجنب هذا الخطأ الصامت. الآن يتطابق مُخرجات التسلسل الثاني مع مُخرجاته الفعلية: + + +بشكل افتراضي، ينشئ مجزىء النصوص `attention_mask` لك استنادًا إلى إعدادات المجزىء المحدد. + + +```python +>>> attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0]]) +>>> output = model(input_ids, attention_mask=attention_mask) +>>> print(output.logits) +tensor([[ 0.0082, -0.2307], +[-0.1008, -0.4061]], grad_fn=) +``` + +لا ينشئ 🤗 Transformers تلقائيًا `attention_mask` لإخفاء رمز الحشو إذا تم توفيره لأن: + +- بعض النماذج ليس لها رمز حشو. + +- بالنسبة لبعض الاستخدامات، يريد المستخدمون أن ينتبه النموذج إلى رمز الحشو. +## ValueError: فئة التكوين غير المعترف بها XYZ لهذا النوع من AutoModel + +بشكل عام، نوصي باستخدام فئة [`AutoModel`] لتحميل النسخ المدربة مسبقًا من النماذج. يمكن لهذه الفئة أن تستنتج وتُحمل تلقائيًا البنية الصحيحة من نسخ معينة بناءً على التكوين. إذا رأيت هذا الخطأ `ValueError` عند تحميل نموذج من نسخة، فهذا يعني أن الفئة التلقائية (Auto) لم تتمكن من العثور على خريطة من التكوين في نقطة التفتيش المعطاة إلى نوع النموذج الذي تُحاول تحميله. وغالبًا ما يحدث هذا عندما لا تدعم نقطة التفتيش مهمة معينة. + +على سبيل المثال، سترى هذا الخطأ في المثال التالي لأنه لا يوجد GPT2 للإجابة على الأسئلة: + +```py +>>> from transformers import AutoProcessor, AutoModelForQuestionAnswering + +>>> processor = AutoProcessor.from_pretrained("openai-community/gpt2-medium") +>>> model = AutoModelForQuestionAnswering.from_pretrained("openai-community/gpt2-medium") +ValueError: Unrecognized configuration class for this kind of AutoModel: AutoModelForQuestionAnswering. +Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, ... +``` diff --git a/docs/source/de/installation.md b/docs/source/de/installation.md index 1bd34f73302b27..44b6f1ed981e1e 100644 --- a/docs/source/de/installation.md +++ b/docs/source/de/installation.md @@ -149,7 +149,7 @@ conda install conda-forge::transformers Vorgefertigte Modelle werden heruntergeladen und lokal zwischengespeichert unter: `~/.cache/huggingface/hub`. Dies ist das Standardverzeichnis, das durch die Shell-Umgebungsvariable "TRANSFORMERS_CACHE" vorgegeben ist. Unter Windows wird das Standardverzeichnis durch `C:\Benutzer\Benutzername\.cache\huggingface\hub` angegeben. Sie können die unten aufgeführten Shell-Umgebungsvariablen - in der Reihenfolge ihrer Priorität - ändern, um ein anderes Cache-Verzeichnis anzugeben: -1. Shell-Umgebungsvariable (Standard): `HUGGINGFACE_HUB_CACHE` oder `TRANSFORMERS_CACHE`. +1. Shell-Umgebungsvariable (Standard): `HF_HUB_CACHE` oder `TRANSFORMERS_CACHE`. 2. Shell-Umgebungsvariable: `HF_HOME`. 3. Shell-Umgebungsvariable: `XDG_CACHE_HOME` + `/huggingface`. 
diff --git a/docs/source/de/model_sharing.md b/docs/source/de/model_sharing.md index 6bbb6e10cb4942..850d9a3454a9c1 100644 --- a/docs/source/de/model_sharing.md +++ b/docs/source/de/model_sharing.md @@ -43,7 +43,7 @@ Folglich können Sie eine bestimmte Modellversion mit dem Parameter "Revision" l ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash ... ) ``` diff --git a/docs/source/de/quicktour.md b/docs/source/de/quicktour.md index 01cd7200750c4d..c01609207fec2a 100644 --- a/docs/source/de/quicktour.md +++ b/docs/source/de/quicktour.md @@ -109,7 +109,7 @@ label: NEGATIVE, with score: 0.5309 Die [`pipeline`] kann auch über einen ganzen Datensatz iterieren. Starten wir mit der Installation der [🤗 Datasets](https://huggingface.co/docs/datasets/) Bibliothek: ```bash -pip install datasets +pip install datasets ``` Erstellen wir eine [`pipeline`] mit der Aufgabe die wir lösen und dem Modell welches wir nutzen möchten. @@ -191,7 +191,7 @@ Wenn Sie kein Modell für Ihren Anwendungsfall finden können, müssen Sie ein v -Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines trainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen. +Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines trainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen. Kehren wir zu unserem Beispiel zurück und sehen wir uns an, wie Sie die `AutoClass` verwenden können, um die Ergebnisse der [`pipeline`] zu replizieren. @@ -281,7 +281,7 @@ Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Model ``` Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. 
Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten: - + ```py >>> from torch import nn @@ -308,7 +308,7 @@ In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klass Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben, indem Sie die Wörterbuchschlüssel direkt an die Tensoren übergeben: - + ```py >>> tf_outputs = tf_model(tf_batch) ``` @@ -383,8 +383,8 @@ Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell ```py >>> from transformers import AutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) +>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) ``` @@ -392,8 +392,8 @@ Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell ```py >>> from transformers import TFAutoModel ->>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) ->>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) +>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) ``` diff --git a/docs/source/en/_config.py b/docs/source/en/_config.py index 4381def017ddc5..f49e4e4731965a 100644 --- a/docs/source/en/_config.py +++ b/docs/source/en/_config.py @@ -11,4 +11,4 @@ "{processor_class}": "FakeProcessorClass", "{model_class}": "FakeModelClass", "{object_class}": "FakeObjectClass", -} \ No newline at end of file +} diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a5e11c05ebf372..c512a59550e7f5 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -167,6 +167,8 @@ title: AWQ - local: quantization/aqlm title: AQLM + - local: quantization/vptq + title: VPTQ - local: quantization/quanto title: Quanto - local: quantization/eetq @@ -218,6 +220,8 @@ title: CPU inference - local: perf_infer_gpu_one title: GPU inference + - local: perf_infer_gpu_multi + title: Multi-GPU inference title: Optimizing inference - local: big_models title: Instantiate a big model @@ -320,6 +324,8 @@ sections: - local: model_doc/albert title: ALBERT + - local: model_doc/bamba + title: Bamba - local: model_doc/bart title: BART - local: model_doc/barthez @@ -360,6 +366,8 @@ title: CodeLlama - local: model_doc/cohere title: Cohere + - local: model_doc/cohere2 + title: Cohere2 - local: model_doc/convbert title: ConvBERT - local: model_doc/cpm @@ -392,6 +400,8 @@ title: ESM - local: model_doc/falcon title: Falcon + - local: model_doc/falcon3 + title: Falcon3 - local: model_doc/falcon_mamba title: FalconMamba - local: model_doc/fastspeech2_conformer @@ -490,6 +500,8 @@ title: mLUKE - local: model_doc/mobilebert title: MobileBERT + - local: model_doc/modernbert + title: ModernBert - local: model_doc/mpnet title: MPNet - local: model_doc/mpt @@ -514,6 +526,8 @@ title: Nyströmformer - local: model_doc/olmo title: OLMo + - local: model_doc/olmo2 + title: OLMo2 - local: model_doc/olmoe title: OLMoE - local: model_doc/open-llama @@ -655,6 +669,8 @@ title: GLPN - local: model_doc/hiera title: Hiera + - local: model_doc/ijepa + title: I-JEPA - local: model_doc/imagegpt title: ImageGPT - local: model_doc/levit @@ -701,6 +717,8 @@ title: Swin2SR - local: model_doc/table-transformer title: 
Table Transformer + - local: model_doc/timm_wrapper + title: Timm Wrapper - local: model_doc/upernet title: UperNet - local: model_doc/van @@ -806,6 +824,8 @@ title: ALIGN - local: model_doc/altclip title: AltCLIP + - local: model_doc/aria + title: Aria - local: model_doc/blip title: BLIP - local: model_doc/blip-2 @@ -824,6 +844,8 @@ title: CLIPSeg - local: model_doc/clvp title: CLVP + - local: model_doc/colpali + title: ColPali - local: model_doc/data2vec title: Data2Vec - local: model_doc/deplot diff --git a/docs/source/en/add_new_pipeline.md b/docs/source/en/add_new_pipeline.md index 1e5b95e9b48cfc..e8234c565b26c8 100644 --- a/docs/source/en/add_new_pipeline.md +++ b/docs/source/en/add_new_pipeline.md @@ -184,7 +184,7 @@ class PairClassificationPipeline(Pipeline): ``` The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in -a file named `pair_classification.py`, we can then import it and register it like this: +a file named `pair_classification.py`, we can then import it and register it like this. ```py from pair_classification import PairClassificationPipeline @@ -199,6 +199,22 @@ PIPELINE_REGISTRY.register_pipeline( ) ``` +The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5fb0164e89d4998e5776897c16f7330d3df/src/transformers/pipelines/base.py#L1387) function registers the pipeline details (task type, pipeline class, supported backends) to a models `config.json` file. + +```json + "custom_pipelines": { + "pair-classification": { + "impl": "pair_classification.PairClassificationPipeline", + "pt": [ + "AutoModelForSequenceClassification" + ], + "tf": [ + "TFAutoModelForSequenceClassification" + ], + } + }, +``` + Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md index 721e348f89fe22..56c9184980f4b2 100644 --- a/docs/source/en/agents.md +++ b/docs/source/en/agents.md @@ -225,7 +225,7 @@ You have access to the following tools: To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use. -Then in the 'Code:' sequence, you shold write the code in simple Python. The code sequence must end with '/End code' sequence. +Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence. During each intermediate step, you can use 'print()' to save whatever important information you will then need. These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step. diff --git a/docs/source/en/agents_advanced.md b/docs/source/en/agents_advanced.md index ddcc619b4f91f6..c4753bf1366b09 100644 --- a/docs/source/en/agents_advanced.md +++ b/docs/source/en/agents_advanced.md @@ -123,52 +123,70 @@ from transformers import load_tool, CodeAgent model_download_tool = load_tool("m-ric/hf-model-downloads") ``` -### Use gradio-tools +### Import a Space as a tool 🚀 -[gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging -Face Spaces as tools. It supports many existing Spaces as well as custom Spaces. 
+You can directly import a Space from the Hub as a tool using the [`Tool.from_space`] method! -Transformers supports `gradio_tools` with the [`Tool.from_gradio`] method. For example, let's use the [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) from `gradio-tools` toolkit for improving prompts to generate better images. +You only need to provide the id of the Space on the Hub, its name, and a description that will help you agent understand what the tool does. Under the hood, this will use [`gradio-client`](https://pypi.org/project/gradio-client/) library to call the Space. -Import and instantiate the tool, then pass it to the `Tool.from_gradio` method: +For instance, let's import the [FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) Space from the Hub and use it to generate an image. -```python -from gradio_tools import StableDiffusionPromptGeneratorTool -from transformers import Tool, load_tool, CodeAgent +``` +from transformers import Tool -gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool() -prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool) +image_generation_tool = Tool.from_space( + "black-forest-labs/FLUX.1-dev", + name="image_generator", + description="Generate an image from a prompt") + +image_generation_tool("A sunny beach") ``` +And voilà, here's your image! 🏖️ -Now you can use it just like any other tool. For example, let's improve the prompt `a rabbit wearing a space suit`. + + +Then you can use this tool just like any other tool. For example, let's improve the prompt `a rabbit wearing a space suit` and generate an image of it. ```python -image_generation_tool = load_tool('huggingface-tools/text-to-image') -agent = CodeAgent(tools=[prompt_generator_tool, image_generation_tool], llm_engine=llm_engine) +from transformers import ReactCodeAgent + +agent = ReactCodeAgent(tools=[image_generation_tool]) agent.run( "Improve this prompt, then generate an image of it.", prompt='A rabbit wearing a space suit' ) ``` -The model adequately leverages the tool: ```text -======== New task ======== -Improve this prompt, then generate an image of it. -You have been provided with these initial arguments: {'prompt': 'A rabbit wearing a space suit'}. -==== Agent is executing the code below: -improved_prompt = StableDiffusionPromptGenerator(query=prompt) -while improved_prompt == "QUEUE_FULL": - improved_prompt = StableDiffusionPromptGenerator(query=prompt) -print(f"The improved prompt is {improved_prompt}.") -image = image_generator(prompt=improved_prompt) -==== +=== Agent thoughts: +improved_prompt could be "A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background" + +Now that I have improved the prompt, I can use the image generator tool to generate an image based on this prompt. +>>> Agent is executing the code below: +image = image_generator(prompt="A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background") +final_answer(image) ``` -Before finally generating the image: + - +How cool is this? 🤩 +### Use gradio-tools + +[gradio-tools](https://github.com/freddyaboulton/gradio-tools) is a powerful library that allows using Hugging +Face Spaces as tools. It supports many existing Spaces as well as custom Spaces. + +Transformers supports `gradio_tools` with the [`Tool.from_gradio`] method. 
For example, let's use the [`StableDiffusionPromptGeneratorTool`](https://github.com/freddyaboulton/gradio-tools/blob/main/gradio_tools/tools/prompt_generator.py) from `gradio-tools` toolkit for improving prompts to generate better images. + +Import and instantiate the tool, then pass it to the `Tool.from_gradio` method: + +```python +from gradio_tools import StableDiffusionPromptGeneratorTool +from transformers import Tool, load_tool, CodeAgent + +gradio_prompt_generator_tool = StableDiffusionPromptGeneratorTool() +prompt_generator_tool = Tool.from_gradio(gradio_prompt_generator_tool) +``` > [!WARNING] > gradio-tools require *textual* inputs and outputs even when working with different modalities like image and audio objects. Image and audio inputs and outputs are currently incompatible. @@ -179,7 +197,7 @@ We love Langchain and think it has a very compelling suite of tools. To import a tool from LangChain, use the `from_langchain()` method. Here is how you can use it to recreate the intro's search result using a LangChain web search tool. - +This tool will need `pip install google-search-results` to work properly. ```python from langchain.agents import load_tools from transformers import Tool, ReactCodeAgent @@ -188,12 +206,12 @@ search_tool = Tool.from_langchain(load_tools(["serpapi"])[0]) agent = ReactCodeAgent(tools=[search_tool]) -agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?") +agent.run("How many more blocks (also denoted as layers) are in BERT base encoder compared to the encoder from the architecture proposed in Attention is All You Need?") ``` ## Display your agent run in a cool Gradio interface -You can leverage `gradio.Chatbot`to display your agent's thoughts using `stream_to_gradio`, here is an example: +You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`, here is an example: ```py import gradio as gr diff --git a/docs/source/en/autoclass_tutorial.md b/docs/source/en/autoclass_tutorial.md index 0f02f19ed29534..33f48b2b043fec 100644 --- a/docs/source/en/autoclass_tutorial.md +++ b/docs/source/en/autoclass_tutorial.md @@ -138,12 +138,15 @@ Load a processor with [`AutoProcessor.from_pretrained`]: -The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]: +The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]. + +> [!WARNING] +> By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. 
```py >>> from transformers import AutoModelForSequenceClassification ->>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") +>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") ``` Easily reuse the same checkpoint to load an architecture for a different task: @@ -151,7 +154,7 @@ Easily reuse the same checkpoint to load an architecture for a different task: ```py >>> from transformers import AutoModelForTokenClassification ->>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") +>>> model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") ``` diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index 1bdf05a26c8d08..0108cb48e95cee 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -683,7 +683,7 @@ one is a little simplified from the actual one! ``` {%- for message in messages %} - {{- '<|' + message['role'] + |>\n' }} + {{- '<|' + message['role'] + '|>\n' }} {{- message['content'] + eos_token }} {%- endfor %} {%- if add_generation_prompt %} @@ -1116,4 +1116,4 @@ name to be included in the tool response, then rendering it can be as simple as: ``` Again, remember that the actual formatting and special tokens are model-specific - you should take a lot of care -to ensure that tokens, whitespace and everything else exactly match the format your model was trained with! \ No newline at end of file +to ensure that tokens, whitespace and everything else exactly match the format your model was trained with! diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 64ded9613716a5..47032a2a292b1b 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -403,7 +403,7 @@ culture, and they allow us to design the' This guide illustrates the main parameters that enable various decoding strategies. More advanced parameters exist for the [`generate`] method, which gives you even further control over the [`generate`] method's behavior. -For the complete list of the available parameters, refer to the [API documentation](./main_classes/text_generation.md). +For the complete list of the available parameters, refer to the [API documentation](./main_classes/text_generation). ### Speculative Decoding @@ -416,16 +416,6 @@ Assisted decoding assumes the main and assistant models have the same tokenizer, Currently, only greedy search and sampling are supported with assisted decoding, and assisted decoding doesn't support batched inputs. To learn more about assisted decoding, check [this blog post](https://huggingface.co/blog/assisted-generation). -#### Universal Assisted Decoding - -Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers. -To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below). -Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are -in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above. -The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer. 
-Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings, -to ensure the new tokens include the correct prompt suffix. - To enable assisted decoding, set the `assistant_model` argument with a model. ```python @@ -445,7 +435,38 @@ To enable assisted decoding, set the `assistant_model` argument with a model. ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` -If the main and assistant models have different tokenizers, use Universal Assisted Decoding. +When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness, +just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency. + +```python +>>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed +>>> set_seed(42) # For reproducibility + +>>> prompt = "Alice and Bob" +>>> checkpoint = "EleutherAI/pythia-1.4b-deduped" +>>> assistant_checkpoint = "EleutherAI/pythia-160m-deduped" + +>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) +>>> inputs = tokenizer(prompt, return_tensors="pt") + +>>> model = AutoModelForCausalLM.from_pretrained(checkpoint) +>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) +>>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5) +>>> tokenizer.batch_decode(outputs, skip_special_tokens=True) +['Alice and Bob, a couple of friends of mine, who are both in the same office as'] +``` + +We recommend to install `scikit-learn` library to enhance the candidate generation strategy and achieve additional speedup. + +#### Universal Assisted Decoding + +Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers. +To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below). +Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are +in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above. +The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer. +Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings, +to ensure the new tokens include the correct prompt suffix. ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer @@ -465,30 +486,35 @@ If the main and assistant models have different tokenizers, use Universal Assist ['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` -When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness, -just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency. +#### Prompt Lookup + +Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed +to model based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259). 
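As a minimal sketch (reusing the `EleutherAI/pythia-1.4b-deduped` checkpoint from the examples above; `max_new_tokens=20` is an arbitrary choice and the exact continuation will depend on the model), prompt lookup decoding only requires the `prompt_lookup_num_tokens` argument and no assistant model:

```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer

>>> prompt = "Alice and Bob"
>>> checkpoint = "EleutherAI/pythia-1.4b-deduped"

>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> # candidate tokens are looked up as n-grams in the input itself, so no assistant model is loaded
>>> outputs = model.generate(**inputs, prompt_lookup_num_tokens=3, max_new_tokens=20)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
```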
+ +#### Self-Speculative Decoding + +An LLM can be trained to also use its language modeling head with earlier hidden states as input, effectively +skipping layers to yield a lower-quality output -- a technique called early exiting. +We use the lower-quality early exit output as an assistant output, and apply self-speculation to fix the output using the remaining layers. The final generation of that self-speculative solution is the same (or has the same distribution) as the original model's generation. +If the model you're using was trained to do early exit, you can pass +`assistant_early_exit` (integer). In this case, the assistant model will be the same model but exiting early, hence the +"self-speculative" name. Because the assistant model is a portion of the target model, caches and weights can be shared, which results in lower memory requirements. As in other assisted generation methods, the final generated result has the same quality as if no assistant had been used. ```python ->>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed ->>> set_seed(42) # For reproducibility +>>> from transformers import AutoModelForCausalLM, AutoTokenizer >>> prompt = "Alice and Bob" ->>> checkpoint = "EleutherAI/pythia-1.4b-deduped" ->>> assistant_checkpoint = "EleutherAI/pythia-160m-deduped" +>>> checkpoint = "facebook/layerskip-llama3.2-1B" >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint) >>> inputs = tokenizer(prompt, return_tensors="pt") >>> model = AutoModelForCausalLM.from_pretrained(checkpoint) ->>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) ->>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5) +>>> outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Alice and Bob, a couple of friends of mine, who are both in the same office as'] +['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a'] ``` -Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed -to model based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259). - ### DoLa Decoding **D**ecoding by C**o**ntrasting **La**yers (DoLa) is a contrastive decoding strategy to improve the factuality and reduce the @@ -508,10 +534,11 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model. ```python >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed >>> import torch +>>> from accelerate.test_utils.testing import get_backend >>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") >>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16) ->>> device = 'cuda' if torch.cuda.is_available() else 'cpu' +>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) 
>>> model.to(device) >>> set_seed(42) diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md index 2da721b28986af..b1ed1f0d492ab9 100644 --- a/docs/source/en/gguf.md +++ b/docs/source/en/gguf.md @@ -87,6 +87,7 @@ For now the supported model architectures are the architectures that have been v - Starcoder2 - T5 - Mamba +- Nemotron ## Example usage diff --git a/docs/source/en/index.md b/docs/source/en/index.md index d3418c7012e947..345abcc875dd46 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -62,8 +62,11 @@ Flax), PyTorch, and/or TensorFlow. | [ALBERT](model_doc/albert) | ✅ | ✅ | ✅ | | [ALIGN](model_doc/align) | ✅ | ❌ | ❌ | | [AltCLIP](model_doc/altclip) | ✅ | ❌ | ❌ | +| [Aria](model_doc/aria) | ✅ | ❌ | ❌ | +| [AriaText](model_doc/aria_text) | ✅ | ❌ | ❌ | | [Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer) | ✅ | ❌ | ❌ | | [Autoformer](model_doc/autoformer) | ✅ | ❌ | ❌ | +| [Bamba](model_doc/bamba) | ✅ | ❌ | ❌ | | [Bark](model_doc/bark) | ✅ | ❌ | ❌ | | [BART](model_doc/bart) | ✅ | ✅ | ✅ | | [BARThez](model_doc/barthez) | ✅ | ✅ | ✅ | @@ -97,6 +100,8 @@ Flax), PyTorch, and/or TensorFlow. | [CodeGen](model_doc/codegen) | ✅ | ❌ | ❌ | | [CodeLlama](model_doc/code_llama) | ✅ | ❌ | ✅ | | [Cohere](model_doc/cohere) | ✅ | ❌ | ❌ | +| [Cohere2](model_doc/cohere2) | ✅ | ❌ | ❌ | +| [ColPali](model_doc/colpali) | ✅ | ❌ | ❌ | | [Conditional DETR](model_doc/conditional_detr) | ✅ | ❌ | ❌ | | [ConvBERT](model_doc/convbert) | ✅ | ✅ | ❌ | | [ConvNeXT](model_doc/convnext) | ✅ | ✅ | ❌ | @@ -138,6 +143,7 @@ Flax), PyTorch, and/or TensorFlow. | [ESM](model_doc/esm) | ✅ | ✅ | ❌ | | [FairSeq Machine-Translation](model_doc/fsmt) | ✅ | ❌ | ❌ | | [Falcon](model_doc/falcon) | ✅ | ❌ | ❌ | +| [Falcon3](model_doc/falcon3) | ✅ | ❌ | ✅ | | [FalconMamba](model_doc/falcon_mamba) | ✅ | ❌ | ❌ | | [FastSpeech2Conformer](model_doc/fastspeech2_conformer) | ✅ | ❌ | ❌ | | [FLAN-T5](model_doc/flan-t5) | ✅ | ✅ | ✅ | @@ -169,9 +175,11 @@ Flax), PyTorch, and/or TensorFlow. | [Hiera](model_doc/hiera) | ✅ | ❌ | ❌ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | +| [I-JEPA](model_doc/ijepa) | ✅ | ❌ | ❌ | | [IDEFICS](model_doc/idefics) | ✅ | ✅ | ❌ | | [Idefics2](model_doc/idefics2) | ✅ | ❌ | ❌ | | [Idefics3](model_doc/idefics3) | ✅ | ❌ | ❌ | +| [Idefics3VisionTransformer](model_doc/idefics3_vision) | ❌ | ❌ | ❌ | | [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ | | [Informer](model_doc/informer) | ✅ | ❌ | ❌ | | [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ | @@ -225,6 +233,7 @@ Flax), PyTorch, and/or TensorFlow. | [MobileNetV2](model_doc/mobilenet_v2) | ✅ | ❌ | ❌ | | [MobileViT](model_doc/mobilevit) | ✅ | ✅ | ❌ | | [MobileViTV2](model_doc/mobilevitv2) | ✅ | ❌ | ❌ | +| [ModernBERT](model_doc/modernbert) | ✅ | ❌ | ❌ | | [Moshi](model_doc/moshi) | ✅ | ❌ | ❌ | | [MPNet](model_doc/mpnet) | ✅ | ✅ | ❌ | | [MPT](model_doc/mpt) | ✅ | ❌ | ❌ | @@ -241,6 +250,7 @@ Flax), PyTorch, and/or TensorFlow. | [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ | | [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ | | [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ | +| [OLMo2](model_doc/olmo2) | ✅ | ❌ | ❌ | | [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ | | [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ | | [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ | @@ -317,6 +327,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [TAPEX](model_doc/tapex) | ✅ | ✅ | ✅ | | [Time Series Transformer](model_doc/time_series_transformer) | ✅ | ❌ | ❌ | | [TimeSformer](model_doc/timesformer) | ✅ | ❌ | ❌ | +| [TimmWrapperModel](model_doc/timm_wrapper) | ✅ | ❌ | ❌ | | [Trajectory Transformer](model_doc/trajectory_transformer) | ✅ | ❌ | ❌ | | [Transformer-XL](model_doc/transfo-xl) | ✅ | ✅ | ❌ | | [TrOCR](model_doc/trocr) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index f4ce768c3168e9..af7c97ef3508ae 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -157,7 +157,7 @@ conda install conda-forge::transformers Pretrained models are downloaded and locally cached at: `~/.cache/huggingface/hub`. This is the default directory given by the shell environment variable `TRANSFORMERS_CACHE`. On Windows, the default directory is given by `C:\Users\username\.cache\huggingface\hub`. You can change the shell environment variables shown below - in order of priority - to specify a different cache directory: -1. Shell environment variable (default): `HUGGINGFACE_HUB_CACHE` or `TRANSFORMERS_CACHE`. +1. Shell environment variable (default): `HF_HUB_CACHE` or `TRANSFORMERS_CACHE`. 2. Shell environment variable: `HF_HOME`. 3. Shell environment variable: `XDG_CACHE_HOME` + `/huggingface`. diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index eb25ddb6329755..d8931342ee45f8 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -352,6 +352,8 @@ A [`Constraint`] can be used to force the generation to include specific tokens [[autodoc]] TextIteratorStreamer +[[autodoc]] AsyncTextIteratorStreamer + ## Caches [[autodoc]] Cache @@ -436,3 +438,9 @@ A [`Constraint`] can be used to force the generation to include specific tokens [[autodoc]] SynthIDTextWatermarkDetector - __call__ + +## Compile Utils + +[[autodoc]] CompileConfig + - __call__ + diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md index 05ab9eafa72349..b1d1e0998f06ed 100644 --- a/docs/source/en/kv_cache.md +++ b/docs/source/en/kv_cache.md @@ -180,7 +180,7 @@ Fun fact: The shortest war in history was between Britain and Zanzibar on August -Cache offloading requires a GPU and can be slower than dynamic KV cache. Use it if you are getting CUDA out of memory errors. +Cache offloading requires a CUDA GPU and can be slower than dynamic KV cache. Use it if you are getting CUDA out of memory errors. @@ -261,6 +261,7 @@ This will use the [`~OffloadedStaticCache`] implementation instead. >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0] "Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of" ``` +Cache offloading requires a CUDA GPU. 
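For example, a minimal sketch of such a guard (reusing the `model`, `inputs`, and `tokenizer` from the snippet above, and assuming the `"offloaded_static"` / `"static"` identifiers accepted by `cache_implementation`; `max_new_tokens=20` is arbitrary) could fall back to the on-device static cache when no CUDA device is present:

```python
>>> import torch

>>> # offloading the static cache needs a CUDA device; otherwise keep the cache on the device itself
>>> cache_impl = "offloaded_static" if torch.cuda.is_available() else "static"
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation=cache_impl)
>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
```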
### Sliding Window Cache diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 0a6a7e15bea081..17ebb841de7a39 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -57,13 +57,13 @@ import os os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") -model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto") model.generation_config.cache_implementation = "static" model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) input_text = "The theory of special relativity states " -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type) outputs = model.generate(**input_ids) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -89,11 +89,11 @@ import os os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") -model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto") model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) input_text = "The theory of special relativity states " -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type) prompt_length = input_ids.input_ids.shape[1] model.generation_config.max_new_tokens = 16 @@ -126,6 +126,7 @@ If you want to go further down a level, the [`StaticCache`] object can also be p from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging from transformers.testing_utils import CaptureLogger import torch +from accelerate.test_utils.testing import get_backend prompts = [ "Simply put, the theory of relativity states that ", @@ -133,7 +134,7 @@ prompts = [ ] NUM_TOKENS_TO_GENERATE = 40 -torch_device = "cuda" +torch_device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) 
tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token="", padding_side="right") model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="sequential") @@ -201,11 +202,11 @@ import os os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") -model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto") model.generate = torch.compile(model.generate, mode="reduce-overhead", fullgraph=True) input_text = "The theory of special relativity states " -input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +input_ids = tokenizer(input_text, return_tensors="pt").to(model.device.type) outputs = model.generate(**input_ids) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -241,13 +242,14 @@ Enable speculative decoding by loading an assistant model and passing it to the ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("Einstein's theory of relativity states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device) outputs = model.generate(**inputs, assistant_model=assistant_model) tokenizer.batch_decode(outputs, skip_special_tokens=True) @@ -262,13 +264,14 @@ For speculative sampling decoding, add the `do_sample` and `temperature` paramet ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("Einstein's theory of relativity states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device) outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.7) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -290,13 +293,14 @@ To enable prompt lookup decoding, specify the number of tokens that should be ov ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) 
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device) outputs = model.generate(**inputs, prompt_lookup_num_tokens=3) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) @@ -311,13 +315,14 @@ For prompt lookup decoding with sampling, add the `do_sample` and `temperature` ```py from transformers import AutoModelForCausalLM, AutoTokenizer import torch +from accelerate.test_utils.testing import get_backend -device = "cuda" if torch.cuda.is_available() else "cpu" +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b") inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt").to(device) -model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b").to(device) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device) outputs = model.generate(**inputs, prompt_lookup_num_tokens=3, do_sample=True, temperature=0.7) print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) ["The second law of thermodynamics states that energy cannot be created nor destroyed. It's not a"] @@ -468,7 +473,7 @@ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable Quantization reduces the size of the LLM weights by storing them in a lower precision. This translates to lower memory usage and makes loading LLMs for inference more accessible if you're constrained by your GPUs memory. If you aren't limited by your GPU, you don't necessarily need to quantize your model because it can incur a small latency cost (except for AWQ and fused AWQ modules) due to the extra step required to quantize and dequantize the weights. > [!TIP] -> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes. +> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, VPTQ, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes. Use the Model Memory Calculator below to estimate and compare how much memory is required to load a model. For example, try estimating how much memory it costs to load [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1). diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md index 9d3d8ad6ba8b86..3414725fc37087 100644 --- a/docs/source/en/llm_tutorial_optimization.md +++ b/docs/source/en/llm_tutorial_optimization.md @@ -147,7 +147,7 @@ Let's call it now for the next experiment. 
```python flush() ``` -In the recent version of the accelerate library, you can also use a utility method called `release_memory()` +From the Accelerate library, you can also use a device-agnostic utility method called [release_memory](https://github.com/huggingface/accelerate/blob/29be4788629b772a3b722076e433b5b3b5c85da3/src/accelerate/utils/memory.py#L63), which takes various hardware backends like XPU, MLU, NPU, MPS, and more into account. ```python from accelerate.utils import release_memory diff --git a/docs/source/en/main_classes/image_processor.md b/docs/source/en/main_classes/image_processor.md index 320916f1ce9421..cbf6ae95577f70 100644 --- a/docs/source/en/main_classes/image_processor.md +++ b/docs/source/en/main_classes/image_processor.md @@ -27,6 +27,7 @@ from transformers import AutoImageProcessor processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True) ``` +Note that `use_fast` will be set to `True` by default in a future release. When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. @@ -42,21 +43,17 @@ images_processed = processor(images, return_tensors="pt", device="cuda") Here are some speed comparisons between the base and fast image processors for the `DETR` and `RT-DETR` models, and how they impact overall inference time:
(Benchmark comparison charts for the base vs. fast image processors.)
These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU. diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md index 3f44569697777b..9b500b69374c88 100755 --- a/docs/source/en/main_classes/quantization.md +++ b/docs/source/en/main_classes/quantization.md @@ -34,6 +34,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide. [[autodoc]] AqlmConfig +## VptqConfig + +[[autodoc]] VptqConfig + ## AwqConfig [[autodoc]] AwqConfig diff --git a/docs/source/en/main_classes/tokenizer.md b/docs/source/en/main_classes/tokenizer.md index 2ad7e450404e77..83d2ae5df6a7fb 100644 --- a/docs/source/en/main_classes/tokenizer.md +++ b/docs/source/en/main_classes/tokenizer.md @@ -51,6 +51,25 @@ token space (e.g., getting the index of the token comprising a given character o to a given token). +# Multimodal Tokenizer + +Apart from that each tokenizer can be a "multimodal" tokenizer which means that the tokenizer will hold all relevant special tokens +as part of tokenizer attributes for easier access. For example, if the tokenizer is loaded from a vision-language model like LLaVA, you will +be able to access `tokenizer.image_token_id` to obtain the special image token used as a placeholder. + +To enable extra special tokens for any type of tokenizer, you have to add the following lines and save the tokenizer. Extra special tokens do not +have to be modality related and can ne anything that the model often needs access to. In the below code, tokenizer at `output_dir` will have direct access +to three more special tokens. + +```python +vision_tokenizer = AutoTokenizer.from_pretrained( + "llava-hf/llava-1.5-7b-hf", + extra_special_tokens={"image_token": "", "boi_token": "", "eoi_token": ""} +) +print(vision_tokenizer.image_token, vision_tokenizer.image_token_id) +("", 32000) +``` + ## PreTrainedTokenizer [[autodoc]] PreTrainedTokenizer diff --git a/docs/source/en/model_doc/aria.md b/docs/source/en/model_doc/aria.md new file mode 100644 index 00000000000000..9ff7a6687aa939 --- /dev/null +++ b/docs/source/en/model_doc/aria.md @@ -0,0 +1,106 @@ + + +# Aria + +## Overview + +The Aria model was proposed in [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://huggingface.co/papers/2410.05993) by Li et al. from the Rhymes.AI team. + +Aria is an open multimodal-native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. It has a Mixture-of-Experts architecture, with respectively 3.9B and 3.5B activated parameters per visual token and text token. + +The abstract from the paper is the following: + +*Information comes in diverse modalities. Multimodal native AI models are essential to integrate real-world information and deliver comprehensive understanding. While proprietary multimodal native models exist, their lack of openness imposes obstacles for adoptions, let alone adaptations. To fill this gap, we introduce Aria, an open multimodal native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. Aria is a mixture-of-expert model with 3.9B and 3.5B activated parameters per visual token and text token, respectively. It outperforms Pixtral-12B and Llama3.2-11B, and is competitive against the best proprietary models on various multimodal tasks. 
We pre-train Aria from scratch following a 4-stage pipeline, which progressively equips the model with strong capabilities in language understanding, multimodal understanding, long context window, and instruction following. We open-source the model weights along with a codebase that facilitates easy adoptions and adaptations of Aria in real-world applications.* + +This model was contributed by [m-ric](https://huggingface.co/m-ric). +The original code can be found [here](https://github.com/rhymes-ai/Aria). + +## Usage tips + +Here's how to use the model for vision tasks: +```python +import requests +import torch +from PIL import Image + +from transformers import AriaProcessor, AriaForConditionalGeneration + +model_id_or_path = "rhymes-ai/Aria" + +model = AriaForConditionalGeneration.from_pretrained( + model_id_or_path, device_map="auto" +) + +processor = AriaProcessor.from_pretrained(model_id_or_path) + +image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + +messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"text": "what is the image?", "type": "text"}, + ], + } +] + +text = processor.apply_chat_template(messages, add_generation_prompt=True) +inputs = processor(text=text, images=image, return_tensors="pt") +inputs.to(model.device) + +output = model.generate( + **inputs, + max_new_tokens=15, + stop_strings=["<|im_end|>"], + tokenizer=processor.tokenizer, + do_sample=True, + temperature=0.9, +) +output_ids = output[0][inputs["input_ids"].shape[1]:] +response = processor.decode(output_ids, skip_special_tokens=True) +``` + + +## AriaImageProcessor + +[[autodoc]] AriaImageProcessor + +## AriaProcessor + +[[autodoc]] AriaProcessor + +## AriaTextConfig + +[[autodoc]] AriaTextConfig + +## AriaConfig + +[[autodoc]] AriaConfig + +## AriaTextModel + +[[autodoc]] AriaTextModel + +## AriaTextForCausalLM + +[[autodoc]] AriaTextForCausalLM + +## AriaForConditionalGeneration + +[[autodoc]] AriaForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/bamba.md b/docs/source/en/model_doc/bamba.md new file mode 100644 index 00000000000000..4ea8475edb885a --- /dev/null +++ b/docs/source/en/model_doc/bamba.md @@ -0,0 +1,64 @@ + + +# Bamba + + +## Overview + +Bamba-9B is a decoder-only language model based on the [Mamba-2](https://github.com/state-spaces/mamba) architecture and is designed to handle a wide range of text generation tasks. It is trained from scratch using a two-stage training approach. In the first stage, the model is trained on 2 trillion tokens from the Dolma v1.7 dataset. In the second stage, it undergoes additional training on 200 billion tokens, leveraging a carefully curated blend of high-quality data to further refine its performance and enhance output quality. + +Checkout all Bamba-9B model checkpoints [here](https://github.com/foundation-model-stack/bamba). + +## BambaConfig + +| Model | Params | # Layers | Hidden Dim. 
| Attention Heads | GQA | KV Heads | Context Length | Tied Embeddings | +|-------------------|--------------|----------|-------------|-----------------|-----|----------|----------------|------------------| +| Bamba | 9B (9.78B) | 32 | 4096 | 32 | Yes | 8 | 4096 | True | + +[[autodoc]] BambaConfig + + + +## BambaForCausalLM + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer + +model = AutoModelForCausalLM.from_pretrained("ibm-fms/Bamba-9B") +tokenizer = AutoTokenizer.from_pretrained("ibm-fms/Bamba-9B") + +message = ["Mamba is a snake with following properties "] +inputs = tokenizer(message, return_tensors='pt', return_token_type_ids=False) +response = model.generate(**inputs, max_new_tokens=64) +print(tokenizer.batch_decode(response, skip_special_tokens=True)[0]) +``` + +[[autodoc]] BambaForCausalLM + - forward + +This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim). diff --git a/docs/source/en/model_doc/beit.md b/docs/source/en/model_doc/beit.md index f7605ebcdf90d4..25b0eafb26a039 100644 --- a/docs/source/en/model_doc/beit.md +++ b/docs/source/en/model_doc/beit.md @@ -71,6 +71,43 @@ alt="drawing" width="600"/> BEiT pre-training. Taken from the original paper. +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import BeitForImageClassification +model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). 
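For instance, a self-contained sketch of the abbreviated snippet above (assuming a CUDA GPU, the `microsoft/beit-base-patch16-224` checkpoint, and a sample COCO image URL) might look like this:

```python
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, BeitForImageClassification

processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
model = BeitForImageClassification.from_pretrained(
    "microsoft/beit-base-patch16-224",
    attn_implementation="sdpa",
    torch_dtype=torch.float16,
).to("cuda")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# cast the pixel values to the same half-precision dtype as the model weights
pixel_values = processor(images=image, return_tensors="pt").pixel_values.to("cuda", torch.float16)

with torch.no_grad():
    logits = model(pixel_values=pixel_values).logits
print(model.config.id2label[logits.argmax(-1).item()])
```

SDPA is picked up here because the model is loaded with `attn_implementation="sdpa"`, and the half-precision inputs match the `torch.float16` weights.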
+ +On a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.5.1, OS Ubuntu 20.04) with `float16` and +`microsoft/beit-base-patch16-224` model, we saw the following improvements during training and inference: + +#### Training + +| num_training_steps | batch_size | image_size | is_cuda | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | SDPA peak mem (MB) | Mem saving (%) | +|--------------------|------------|--------------|---------|----------------------------|---------------------------|-------------|----------------------|--------------------|----------------| +| 50 | 2 | (1048, 640) | True | 0.984 | 0.746 | 31.975 | 6738.915 | 4319.886 | 55.998 | + +#### Inference + +| Image batch size | Eager (s/iter) | Eager CI, % | Eager memory (MB) | SDPA (s/iter) | SDPA CI, % | SDPA memory (MB) | SDPA speedup | SDPA memory saved (%) | +|-------------------:|-----------------:|:--------------|--------------------:|----------------:|:-------------|-------------------:|---------------:|----------------------:| +| 1 | 0.012 | ±0.3% | 3.76657e+08 | 0.011 | ±0.5% | 3.75739e+08 | 1.05 | 0.244 | +| 4 | 0.013 | ±0.1% | 4.03147e+08 | 0.011 | ±0.2% | 3.90554e+08 | 1.178 | 3.225 | +| 16 | 0.045 | ±0.1% | 4.96697e+08 | 0.035 | ±0.1% | 4.51232e+08 | 1.304 | 10.076 | +| 32 | 0.088 | ±0.1% | 6.24417e+08 | 0.066 | ±0.1% | 5.33488e+08 | 1.325 | 17.044 | + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BEiT. diff --git a/docs/source/en/model_doc/blip-2.md b/docs/source/en/model_doc/blip-2.md index b57c69ca6b321b..4125d372d55ad5 100644 --- a/docs/source/en/model_doc/blip-2.md +++ b/docs/source/en/model_doc/blip-2.md @@ -40,6 +40,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/5 - BLIP-2 can be used for conditional text generation given an image and an optional text prompt. At inference time, it's recommended to use the [`generate`] method. - One can use [`Blip2Processor`] to prepare images for the model, and decode the predicted tokens ID's back to text. +> [!NOTE] +> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there wil be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042). + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BLIP-2. diff --git a/docs/source/en/model_doc/cohere2.md b/docs/source/en/model_doc/cohere2.md new file mode 100644 index 00000000000000..33e67d48fb0e8b --- /dev/null +++ b/docs/source/en/model_doc/cohere2.md @@ -0,0 +1,51 @@ +# Cohere + +## Overview +[C4AI Command R7B](https://cohere.com/blog/command-r7b) is an open weights research release of a 7B billion parameter model developed by Cohere and Cohere For AI. 
It has advanced capabilities optimized for various use cases, including reasoning, summarization, question answering, and code. The model is trained to perform sophisticated tasks including Retrieval Augmented Generation (RAG) and tool use. The model also has powerful agentic capabilities that can use and combine multiple tools over multiple steps to accomplish more difficult tasks. It obtains top performance on enterprise-relevant code use cases. C4AI Command R7B is a multilingual model trained on 23 languages. + +The model features three layers with sliding window attention (window size 4096) and ROPE for efficient local context modeling and relative positional encoding. A fourth layer uses global attention without positional embeddings, enabling unrestricted token interactions across the entire sequence. + +The model has been trained on 23 languages: English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Arabic, Chinese, Russian, Polish, Turkish, Vietnamese, Dutch, Czech, Indonesian, Ukrainian, Romanian, Greek, Hindi, Hebrew, and Persian. + +## Usage tips +The model and tokenizer can be loaded via: + +```python +# pip install transformers +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_id = "CohereForAI/c4ai-command-r7b-12-2024" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained(model_id) + +# Format message with the command-r chat template +messages = [{"role": "user", "content": "Hello, how are you?"}] +input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") + +gen_tokens = model.generate( + input_ids, + max_new_tokens=100, + do_sample=True, + temperature=0.3, +) + +gen_text = tokenizer.decode(gen_tokens[0]) +print(gen_text) +``` + +## Cohere2Config + +[[autodoc]] Cohere2Config + +## Cohere2Model + +[[autodoc]] Cohere2Model + - forward + + +## Cohere2ForCausalLM + +[[autodoc]] Cohere2ForCausalLM + - forward + + diff --git a/docs/source/en/model_doc/colpali.md b/docs/source/en/model_doc/colpali.md new file mode 100644 index 00000000000000..3f6b0cbc6613a9 --- /dev/null +++ b/docs/source/en/model_doc/colpali.md @@ -0,0 +1,90 @@ + + +# ColPali + +## Overview + +The *ColPali* model was proposed in [ColPali: Efficient Document Retrieval with Vision Language Models](https://doi.org/10.48550/arXiv.2407.01449) by **Manuel Faysse***, **Hugues Sibille***, **Tony Wu***, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo (* denotes equal contribution). Work lead by ILLUIN Technology. + +In our proposed *ColPali* approach, we leverage VLMs to construct efficient multi-vector embeddings directly from document images (“screenshots”) for document retrieval. We train the model to maximize the similarity between these document embeddings and the corresponding query embeddings, using the late interaction method introduced in ColBERT. + +Using *ColPali* removes the need for potentially complex and brittle layout recognition and OCR pipelines with a single model that can take into account both the textual and visual content (layout, charts, etc.) of a document. + +## Resources + +- The *ColPali* arXiv paper can be found [here](https://doi.org/10.48550/arXiv.2407.01449). 📄 +- The official blog post detailing ColPali can be found [here](https://huggingface.co/blog/manu/colpali). 📝 +- The original model implementation code for the ColPali model and for the `colpali-engine` package can be found [here](https://github.com/illuin-tech/colpali). 
🌎 +- Cookbooks for learning to use the transformers-native version of *ColPali*, fine-tuning, and similarity maps generation can be found [here](https://github.com/tonywu71/colpali-cookbooks). 📚 + +This model was contributed by [@tonywu71](https://huggingface.co/tonywu71) and [@yonigozlan](https://huggingface.co/yonigozlan). + +## Usage + +This example demonstrates how to use *ColPali* to embed both queries and images, calculate their similarity scores, and identify the most relevant matches. For a specific query, you can retrieve the top-k most similar images by selecting the ones with the highest similarity scores. + +```python +import torch +from PIL import Image + +from transformers import ColPaliForRetrieval, ColPaliProcessor + +model_name = "vidore/colpali-v1.2-hf" + +model = ColPaliForRetrieval.from_pretrained( + model_name, + torch_dtype=torch.bfloat16, + device_map="cuda:0", # or "mps" if on Apple Silicon +).eval() + +processor = ColPaliProcessor.from_pretrained(model_name) + +# Your inputs (replace dummy images with screenshots of your documents) +images = [ + Image.new("RGB", (32, 32), color="white"), + Image.new("RGB", (16, 16), color="black"), +] +queries = [ + "What is the organizational structure for our R&D department?", + "Can you provide a breakdown of last year’s financial performance?", +] + +# Process the inputs +batch_images = processor(images=images).to(model.device) +batch_queries = processor(text=queries).to(model.device) + +# Forward pass +with torch.no_grad(): + image_embeddings = model(**batch_images).embeddings + query_embeddings = model(**batch_queries).embeddings + +# Score the queries against the images +scores = processor.score_retrieval(query_embeddings, image_embeddings) +``` + +## ColPaliConfig + +[[autodoc]] ColPaliConfig + +## ColPaliProcessor + +[[autodoc]] ColPaliProcessor + +## ColPaliForRetrieval + +[[autodoc]] ColPaliForRetrieval + - forward diff --git a/docs/source/en/model_doc/data2vec.md b/docs/source/en/model_doc/data2vec.md index 517a51ce46a3a4..cb1dc675caa55e 100644 --- a/docs/source/en/model_doc/data2vec.md +++ b/docs/source/en/model_doc/data2vec.md @@ -48,6 +48,46 @@ The original code for vision can be found [here](https://github.com/facebookrese - For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization. - For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction. +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +The SDPA implementation is currently available for the Data2VecAudio and Data2VecVision models. + +``` +from transformers import Data2VecVisionForImageClassification +model = Data2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base", attn_implementation="sdpa", torch_dtype=torch.float16) +... 
+``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +For the Data2VecVision model, on a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.5.1, OS Ubuntu 20.04) +with `float16` and `facebook/data2vec-vision-base` model, we saw the following improvements during training and +inference: + +#### Training + +| num_training_steps | batch_size | image_size | is_cuda | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | SDPA peak mem (MB) | Mem saving (%) | +|--------------------|------------|--------------|---------|----------------------------|---------------------------|-------------|----------------------|--------------------|----------------| +| 50 | 2 | (1048, 640) | True | 0.996 | 0.754 | 32.147 | 6722.198 | 4264.653 | 57.626 | + +#### Inference + +| Image batch size | Eager (s/iter) | Eager CI, % | Eager memory (MB) | SDPA (s/iter) | SDPA CI, % | SDPA memory (MB) | SDPA speedup | SDPA memory saved | +|-------------------:|-----------------:|:--------------|--------------------:|----------------:|:-------------|-------------------:|---------------:|--------------------:| +| 1 | 0.011 | ±0.3% | 3.76143e+08 | 0.01 | ±0.3% | 3.74397e+08 | 1.101 | 0.466 | +| 4 | 0.014 | ±0.1% | 4.02756e+08 | 0.012 | ±0.2% | 3.91373e+08 | 1.219 | 2.909 | +| 16 | 0.046 | ±0.3% | 4.96482e+08 | 0.035 | ±0.2% | 4.51017e+08 | 1.314 | 10.081 | +| 32 | 0.088 | ±0.1% | 6.23903e+08 | 0.067 | ±0.1% | 5.32974e+08 | 1.33 | 17.061 | + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Data2Vec. diff --git a/docs/source/en/model_doc/deformable_detr.md b/docs/source/en/model_doc/deformable_detr.md index 82ef251d478b95..5ed99dfe81d1c0 100644 --- a/docs/source/en/model_doc/deformable_detr.md +++ b/docs/source/en/model_doc/deformable_detr.md @@ -54,6 +54,12 @@ If you're interested in submitting a resource to be included here, please feel f - preprocess - post_process_object_detection +## DeformableDetrImageProcessorFast + +[[autodoc]] DeformableDetrImageProcessorFast + - preprocess + - post_process_object_detection + ## DeformableDetrFeatureExtractor [[autodoc]] DeformableDetrFeatureExtractor diff --git a/docs/source/en/model_doc/falcon3.md b/docs/source/en/model_doc/falcon3.md new file mode 100644 index 00000000000000..813533dd7f4d0a --- /dev/null +++ b/docs/source/en/model_doc/falcon3.md @@ -0,0 +1,29 @@ + + +# Falcon3 + +## Overview + +Falcon3 represents a natural evolution from previous releases, emphasizing expanding the models' science, math, and code capabilities. This iteration includes five base models: Falcon3-1B-Base, Falcon3-3B-Base, Falcon3-Mamba-7B-Base, Falcon3-7B-Base, and Falcon3-10B-Base. In developing these models, we incorporated several key innovations aimed at improving the models' performances while reducing training costs: + +One pre-training: We conducted a single large-scale pretraining run on the 7B model, using 2048 H100 GPU chips, leveraging 14 trillion tokens featuring web, code, STEM, and curated high-quality and multilingual data. +Depth up-scaling for improved reasoning: Building on recent studies on the effects of model depth, we upscaled the 7B model to a 10B parameters model by duplicating the redundant layers and continuing pre-training with 2TT of high-quality data. This yielded Falcon3-10B-Base which achieves state-of-the-art zero-shot and few-shot performance for models under 13B parameters. 
+Knowledge distillation for better tiny models: To provide compact and efficient alternatives, we developed Falcon3-1B-Base and Falcon3-3B-Base by leveraging pruning and knowledge distillation techniques, using less than 100GT of curated high-quality data, thereby redefining pre-training efficiency. + +## Resources +- [Blog post](https://huggingface.co/blog/falcon3) +- [Models on Huggingface](https://huggingface.co/collections/tiiuae/falcon3-67605ae03578be86e4e87026) diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index 5ad56b7b5c525d..b9b51082f29e5b 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -141,7 +141,7 @@ Do note that when training Idefics2 on multi-turn conversations between a user a ## Model optimizations: Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md index dfaf40477a7b52..cf7c043e928901 100644 --- a/docs/source/en/model_doc/idefics3.md +++ b/docs/source/en/model_doc/idefics3.md @@ -51,6 +51,13 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) [[autodoc]] Idefics3Config +## Idefics3VisionConfig + +[[autodoc]] Idefics3VisionConfig + +## Idefics3VisionTransformer + +[[autodoc]] Idefics3VisionTransformer ## Idefics3Model diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md new file mode 100644 index 00000000000000..cb2afd25e20bca --- /dev/null +++ b/docs/source/en/model_doc/ijepa.md @@ -0,0 +1,92 @@ + + +# I-JEPA + +## Overview + +The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://arxiv.org/abs/2301.08243) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas. +I-JEPA is a self-supervised learning method that predicts the representations of one part of an image based on other parts of the same image. This approach focuses on learning semantic features without relying on pre-defined invariances from hand-crafted data transformations, which can bias specific tasks, or on filling in pixel-level details, which often leads to less meaningful representations. + +The abstract from the paper is the following: + +This paper demonstrates an approach for learning highly semantic image representations without relying on hand-crafted data-augmentations. We introduce the Image- based Joint-Embedding Predictive Architecture (I-JEPA), a non-generative approach for self-supervised learning from images. The idea behind I-JEPA is simple: from a single context block, predict the representations of various target blocks in the same image. 
A core design choice to guide I-JEPA towards producing semantic representations is the masking strategy; specifically, it is crucial to (a) sample tar- get blocks with sufficiently large scale (semantic), and to (b) use a sufficiently informative (spatially distributed) context block. Empirically, when combined with Vision Transform- ers, we find I-JEPA to be highly scalable. For instance, we train a ViT-Huge/14 on ImageNet using 16 A100 GPUs in under 72 hours to achieve strong downstream performance across a wide range of tasks, from linear classification to object counting and depth prediction. + + + + I-JEPA architecture. Taken from the original paper. + +This model was contributed by [jmtzt](https://huggingface.co/jmtzt). +The original code can be found [here](https://github.com/facebookresearch/ijepa). + +## How to use + +Here is how to use this model for image feature extraction: + +```python +import requests +import torch +from PIL import Image +from torch.nn.functional import cosine_similarity + +from transformers import AutoModel, AutoProcessor + +url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg" +url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg" +image_1 = Image.open(requests.get(url_1, stream=True).raw) +image_2 = Image.open(requests.get(url_2, stream=True).raw) + +model_id = "facebook/ijepa_vith14_1k" +processor = AutoProcessor.from_pretrained(model_id) +model = AutoModel.from_pretrained(model_id) + +@torch.no_grad() +def infer(image): + inputs = processor(image, return_tensors="pt") + outputs = model(**inputs) + return outputs.last_hidden_state.mean(dim=1) + + +embed_1 = infer(image_1) +embed_2 = infer(image_2) + +similarity = cosine_similarity(embed_1, embed_2) +print(similarity) +``` + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with I-JEPA. + + + +- [`IJepaForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + +## IJepaConfig + +[[autodoc]] IJepaConfig + +## IJepaModel + +[[autodoc]] IJepaModel + - forward + +## IJepaForImageClassification + +[[autodoc]] IJepaForImageClassification + - forward \ No newline at end of file diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md index b5fc634b621626..904a96bc786f07 100644 --- a/docs/source/en/model_doc/instructblip.md +++ b/docs/source/en/model_doc/instructblip.md @@ -33,6 +33,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but important difference: it also feeds the text prompt (instruction) to the Q-Former. +> [!NOTE] +> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `` placeholders as there will be query tokens. 
Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042). + ## InstructBlipConfig [[autodoc]] InstructBlipConfig diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md index aa93feb6b6dced..8b2207ce176566 100644 --- a/docs/source/en/model_doc/instructblipvideo.md +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -35,6 +35,10 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m - The model was trained by sampling 4 frames per video, so it's recommended to sample 4 frames +> [!NOTE] +> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expand model embeddings layer to add special `` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.num_query_tokens` and model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042). + ## InstructBlipVideoConfig [[autodoc]] InstructBlipVideoConfig diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index 99950a2ffd8e93..e883572995e924 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -40,6 +40,13 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ - Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results. + +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + ### Single image inference For best results, we recommend users to use the processor's `apply_chat_template()` method to format your prompt correctly.
For that you need to construct a conversation history, passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows: @@ -85,10 +92,10 @@ LLaVa also supports batched inference. Here is how you can do it: import requests from PIL import Image import torch -from transformers import AutoProcessor, LLavaForConditionalGeneration +from transformers import AutoProcessor, LlavaForConditionalGeneration # Load the model in half-precision -model = LLavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto") +model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto") processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") # Get two different images @@ -124,7 +131,7 @@ prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=T prompts = [prompt_1, prompt_2] # We can simply feed images in the order they have to be used in the text prompt -inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(model.device, torch.float16) +inputs = processor(images=[image_stop, image_cats], text=prompts, padding=True, return_tensors="pt").to(model.device, torch.float16) # Generate generate_ids = model.generate(**inputs, max_new_tokens=30) diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index b9146fbd33478a..88bd63e7101f17 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -53,6 +53,12 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
+> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history, passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint. We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows: diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index fe905dfb7932ab..cc3a61aae6c736 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -50,6 +50,12 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre + +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that.
We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. Each content field has to be a list of dicts, as follows: @@ -234,7 +240,7 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained("llava-hf/LLaVA-N ### Flash-Attention 2 to speed-up generation -Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2: diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index 2be657109a8d46..cfa2af3678137a 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -91,7 +91,7 @@ As can be seen, the instruction-tuned model requires a [chat template](../chat_t ## Speeding up Mistral by using Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 7afcaa798ecac4..b5451702e44a16 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -93,7 +93,7 @@ As can be seen, the instruction-tuned model requires a [chat template](../chat_t ## Speeding up Mixtral by using Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/modernbert.md b/docs/source/en/model_doc/modernbert.md new file mode 100644 index 00000000000000..b641d7f3f58199 --- /dev/null +++ b/docs/source/en/model_doc/modernbert.md @@ -0,0 +1,95 @@ + + +# ModernBert + +
+ +Models + + +Paper page + +
+ +## Overview + +The ModernBert model was proposed in [Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference](https://arxiv.org/abs/2412.13663) by Benjamin Warner, Antoine Chaffin, Benjamin Clavié, Orion Weller, Oskar Hallström, Said Taghadouini, Alexis Gallagher, Raja Biswas, Faisal Ladhak, Tom Aarsen, Nathan Cooper, Griffin Adams, Jeremy Howard and Iacopo Poli. + +It is a refresh of the traditional encoder architecture, as used in previous models such as [BERT](https://huggingface.co/docs/transformers/en/model_doc/bert) and [RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/roberta). + +It builds on BERT and implements many modern architectural improvements which have been developed since its original release, such as: +- [Rotary Positional Embeddings](https://huggingface.co/blog/designing-positional-encoding) to support sequences of up to 8192 tokens. +- [Unpadding](https://arxiv.org/abs/2208.08124) to ensure no compute is wasted on padding tokens, speeding up processing time for batches with mixed-length sequences. +- [GeGLU](https://arxiv.org/abs/2002.05202) replacing the original MLP layers with GeGLU layers, shown to improve performance. +- [Alternating Attention](https://arxiv.org/abs/2004.05150v2) where most attention layers employ a sliding window of 128 tokens, with Global Attention only used every 3 layers. +- [Flash Attention](https://github.com/Dao-AILab/flash-attention) to speed up processing. +- A model designed following the recommendations of [The Case for Co-Designing Model Architectures with Hardware](https://arxiv.org/abs/2401.14489), ensuring maximum efficiency across inference GPUs. +- Modern training data scales (2 trillion tokens) and mixtures (including code and math data). + +The abstract from the paper is the following: + +*Encoder-only transformer models such as BERT offer a great performance-size tradeoff for retrieval and classification tasks with respect to larger decoder-only models. Despite being the workhorse of numerous production pipelines, there have been limited Pareto improvements to BERT since its release. In this paper, we introduce ModernBERT, bringing modern model optimizations to encoder-only models and representing a major Pareto improvement over older encoders. Trained on 2 trillion tokens with a native 8192 sequence length, ModernBERT models exhibit state-of-the-art results on a large pool of evaluations encompassing diverse classification tasks and both single and multi-vector retrieval on different domains (including code). In addition to strong downstream performance, ModernBERT is also the most speed and memory efficient encoder and is designed for inference on common GPUs.* + +The original code can be found [here](https://github.com/answerdotai/modernbert). + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ModernBert. + + + +- A notebook on how to [finetune for General Language Understanding Evaluation (GLUE) with Transformers](https://github.com/AnswerDotAI/ModernBERT/blob/main/examples/finetune_modernbert_on_glue.ipynb), also available as a Google Colab [notebook](https://colab.research.google.com/github/AnswerDotAI/ModernBERT/blob/main/examples/finetune_modernbert_on_glue.ipynb). 🌎 + + + +- A script on how to [finetune for text similarity or information retrieval with Sentence Transformers](https://github.com/AnswerDotAI/ModernBERT/blob/main/examples/train_st.py).
🌎 +- A script on how to [finetune for information retrieval with PyLate](https://github.com/AnswerDotAI/ModernBERT/blob/main/examples/train_pylate.py). 🌎 + + + +- [Masked language modeling task guide](../tasks/masked_language_modeling) + + +## ModernBertConfig + +[[autodoc]] ModernBertConfig + + + + +## ModernBertModel + +[[autodoc]] ModernBertModel + - forward + +## ModernBertForMaskedLM + +[[autodoc]] ModernBertForMaskedLM + - forward + +## ModernBertForSequenceClassification + +[[autodoc]] ModernBertForSequenceClassification + - forward + +## ModernBertForTokenClassification + +[[autodoc]] ModernBertForTokenClassification + - forward + + + diff --git a/docs/source/en/model_doc/olmo2.md b/docs/source/en/model_doc/olmo2.md new file mode 100644 index 00000000000000..8ca3326660b3f4 --- /dev/null +++ b/docs/source/en/model_doc/olmo2.md @@ -0,0 +1,46 @@ + + +# OLMo2 + +## Overview + +The OLMo2 model is the successor of the OLMo model, which was proposed in +[OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838). + + The architectural changes from the original OLMo model to this model are: + +- RMSNorm is used instead of standard layer norm. +- Norm is applied to attention queries and keys. +- Norm is applied after attention/feedforward layers rather than before. + +This model was contributed by [shanearora](https://huggingface.co/shanearora). +The original code can be found [here](https://github.com/allenai/OLMo/tree/main/olmo). + + +## Olmo2Config + +[[autodoc]] Olmo2Config + +## Olmo2Model + +[[autodoc]] Olmo2Model + - forward + +## Olmo2ForCausalLM + +[[autodoc]] Olmo2ForCausalLM + - forward diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index ab604e4521fc73..62bdc004c51718 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -88,6 +88,11 @@ output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up [[autodoc]] PixtralImageProcessor - preprocess +## PixtralImageProcessorFast + +[[autodoc]] PixtralImageProcessorFast + - preprocess + ## PixtralProcessor [[autodoc]] PixtralProcessor diff --git a/docs/source/en/model_doc/rt_detr.md b/docs/source/en/model_doc/rt_detr.md index 8ad220dc4bd113..6a1545e123297c 100644 --- a/docs/source/en/model_doc/rt_detr.md +++ b/docs/source/en/model_doc/rt_detr.md @@ -57,7 +57,7 @@ Initially, an image is processed using a pre-trained convolutional neural networ >>> with torch.no_grad(): ... outputs = model(**inputs) ->>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3) +>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3) >>> for result in results: ... for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]): diff --git a/docs/source/en/model_doc/timm_wrapper.md b/docs/source/en/model_doc/timm_wrapper.md new file mode 100644 index 00000000000000..5af3d51746c325 --- /dev/null +++ b/docs/source/en/model_doc/timm_wrapper.md @@ -0,0 +1,67 @@ + + +# TimmWrapper + +## Overview + +Helper class to enable loading timm models to be used with the transformers library and its autoclasses. + +```python +>>> import torch +>>> from PIL import Image +>>> from urllib.request import urlopen +>>> from transformers import AutoModelForImageClassification, AutoImageProcessor + +>>> # Load image +>>> image = Image.open(urlopen( +... 
'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png' +... )) + +>>> # Load model and image processor +>>> checkpoint = "timm/resnet50.a1_in1k" +>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint) +>>> model = AutoModelForImageClassification.from_pretrained(checkpoint).eval() + +>>> # Preprocess image +>>> inputs = image_processor(image) + +>>> # Forward pass +>>> with torch.no_grad(): +... logits = model(**inputs).logits + +>>> # Get top 5 predictions +>>> top5_probabilities, top5_class_indices = torch.topk(logits.softmax(dim=1) * 100, k=5) +``` + +## TimmWrapperConfig + +[[autodoc]] TimmWrapperConfig + +## TimmWrapperImageProcessor + +[[autodoc]] TimmWrapperImageProcessor + - preprocess + +## TimmWrapperModel + +[[autodoc]] TimmWrapperModel + - forward + +## TimmWrapperForImageClassification + +[[autodoc]] TimmWrapperForImageClassification + - forward diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 1c4b5b4b874dd7..a3ba1258ecfa06 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -54,6 +54,12 @@ This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanT The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA). +> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + ## Usage example ### Single Media Mode @@ -168,7 +174,7 @@ model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-L ### Flash-Attention 2 to speed-up generation -Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2: diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md index b3e76cd292e40a..cb625e3711615b 100644 --- a/docs/source/en/model_doc/vipllava.md +++ b/docs/source/en/model_doc/vipllava.md @@ -39,6 +39,12 @@ This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) - Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results. 
+> [!NOTE] +> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you. +Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings. +The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches. + + - For better results, we recommend users to use the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history, passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities, as follows: ```python @@ -52,7 +58,7 @@ conversation = [ "content": [ {"type": "image"}, {"type": "text", "text": "What’s shown in this image?"}, - , + ], }, { "role": "assistant", diff --git a/docs/source/en/model_sharing.md b/docs/source/en/model_sharing.md index ec5802cfee372e..076fc2ccdd571a 100644 --- a/docs/source/en/model_sharing.md +++ b/docs/source/en/model_sharing.md @@ -43,7 +43,7 @@ As a result, you can load a specific model version with the `revision` parameter ```py >>> model = AutoModel.from_pretrained( -... "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash +... "julien-c/EsperBERTo-small", revision="4c77982" # tag name, or branch name, or commit hash ... ) ``` diff --git a/docs/source/en/modular_transformers.md b/docs/source/en/modular_transformers.md index 1516233ec4d6e1..8eebbf347c11c3 100644 --- a/docs/source/en/modular_transformers.md +++ b/docs/source/en/modular_transformers.md @@ -22,6 +22,9 @@ etc. Model contribution PRs rarely add less than 3-5k lines of code, with much o This raises the bar for contributions, and with Modular Transformers, we're aiming to lower the bar to a much more acceptable point. +If you plan to add a model to `transformers` make sure you read [How to add a model to 🤗 Transformers?](https://huggingface.co/docs/transformers/add_new_model). +For any kind of contributions, see [CONTRIBUTING.md](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md). + ## What is it? Modular Transformers introduces the concept of a "modular" file to a model folder. This modular file accepts code @@ -43,6 +46,12 @@ be moved to the new Modular Transformers format in the coming months. ### Details +To generate a single file from the modular file, run the following command. + +```bash +python utils/modular_model_converter.py --files-to-parse src/transformers/models//modular_.py +``` + The "linter", which unravels the inheritance and creates all single-files from the modular file, will flatten the inheritance while trying to be invisible to Python users. 
At this time, the linter flattens a **single** level of inheritance. @@ -59,7 +68,11 @@ file, and the corresponding files will be created for you. ### Enforcement -[TODO] We are introducing a new test, that makes sure the generated content matches what is present in the `modular_xxxx.py` +Run the command below to ensure the generated content matches `modular_.py` + +```bash +python utils/check_modular_conversion.py --files src/transformers/models//modular_.py +``` ### Examples @@ -194,4 +207,4 @@ We now also support special cases like class GemmaVisionModel(CLIPModel): pass ``` -where the name of your class `GemmaVision` is not the same as the modular `Gemma`. This is super useful for composite models. \ No newline at end of file +where the name of your class `GemmaVision` is not the same as the modular `Gemma`. This is super useful for composite models. diff --git a/docs/source/en/perf_infer_cpu.md b/docs/source/en/perf_infer_cpu.md index c0e017c020870e..7f8b525b3df610 100644 --- a/docs/source/en/perf_infer_cpu.md +++ b/docs/source/en/perf_infer_cpu.md @@ -41,8 +41,7 @@ Enable BetterTransformer with the [`PreTrainedModel.to_bettertransformer`] metho ```py from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder") -model.to_bettertransformer() +model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder", torch_dtype="auto") ``` ## TorchScript @@ -54,7 +53,7 @@ For a gentle introduction to TorchScript, see the [Introduction to PyTorch Torch With the [`Trainer`] class, you can enable JIT mode for CPU inference by setting the `--jit_mode_eval` flag: ```bash -python run_qa.py \ +python examples/pytorch/question-answering/run_qa.py \ --model_name_or_path csarron/bert-base-uncased-squad-v1 \ --dataset_name squad \ --do_eval \ @@ -86,7 +85,7 @@ pip install intel_extension_for_pytorch Set the `--use_ipex` and `--jit_mode_eval` flags in the [`Trainer`] class to enable JIT mode with the graph optimizations: ```bash -python run_qa.py \ +python examples/pytorch/question-answering/run_qa.py \ --model_name_or_path csarron/bert-base-uncased-squad-v1 \ --dataset_name squad \ --do_eval \ diff --git a/docs/source/en/perf_infer_gpu_multi.md b/docs/source/en/perf_infer_gpu_multi.md new file mode 100644 index 00000000000000..ea9421747c13df --- /dev/null +++ b/docs/source/en/perf_infer_gpu_multi.md @@ -0,0 +1,68 @@ + + +# Multi-GPU inference + +Built-in Tensor Parallelism (TP) is now available with certain models using PyTorch. Tensor parallelism shards a model onto multiple GPUs, enabling larger model sizes, and parallelizes computations such as matrix multiplication. 
+ +To enable tensor parallel, pass the argument `tp_plan="auto"` to [`~AutoModelForCausalLM.from_pretrained`]: + +```python +import os +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" + +# Initialize distributed +rank = int(os.environ["RANK"]) +device = torch.device(f"cuda:{rank}") +torch.distributed.init_process_group("nccl", device_id=device) + +# Retrieve tensor parallel model +model = AutoModelForCausalLM.from_pretrained( + model_id, + tp_plan="auto", +) + +# Prepare input tokens +tokenizer = AutoTokenizer.from_pretrained(model_id) +prompt = "Can I help" +inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device) + +# Distributed run +outputs = model(inputs) +``` + +You can use `torchrun` to launch the above script with multiple processes, each mapping to a GPU: + +``` +torchrun --nproc-per-node 4 demo.py +``` + +PyTorch tensor parallel is currently supported for the following models: +* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) + +You can request to add tensor parallel support for another model by opening a GitHub Issue or Pull Request. + +### Expected speedups + +You can benefit from considerable speedups for inference, especially for inputs with large batch size or long sequences. + +For a single forward pass on [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) with a sequence length of 512 and various batch sizes, the expected speedup is as follows: + +
+ +
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 67bd31fdaeede5..930f41b6fefba7 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -37,11 +37,14 @@ FlashAttention-2 is experimental and may change considerably in future versions. 2. partitioning the work between GPU threads to reduce communication and shared memory reads/writes between them FlashAttention-2 is currently supported for the following architectures: +* [Aria](https://huggingface.co/docs/transformers/model_doc/aria#transformers.AriaForConditionalGeneration) * [Bark](https://huggingface.co/docs/transformers/model_doc/bark#transformers.BarkModel) +* [Bamba](https://huggingface.co/docs/transformers/model_doc/bamba#transformers.BambaModel) * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) * [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon) * [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) +* [Cohere2](https://huggingface.co/docs/transformers/model_doc/cohere2#transformers.Cohere2Model) * [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel) @@ -71,12 +74,14 @@ FlashAttention-2 is currently supported for the following architectures: * [MBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) +* [ModernBert](https://huggingface.co/docs/transformers/model_doc/modernbert#transformers.ModernBert) * [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel) * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel) * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron) * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) +* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) @@ -215,8 +220,11 @@ PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.o For now, Transformers supports SDPA inference and training for the following architectures: * [Albert](https://huggingface.co/docs/transformers/model_doc/albert#transformers.AlbertModel) +* [Aria](https://huggingface.co/docs/transformers/model_doc/aria#transformers.AriaForConditionalGeneration) * [Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer#transformers.ASTModel) +* 
[Bamba](https://huggingface.co/docs/transformers/model_doc/bamba#transformers.BambaModel) * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) +* [Beit](https://huggingface.co/docs/transformers/model_doc/beit#transformers.BeitModel) * [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel) * [BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt#transformers.BioGptModel) * [CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.CamembertModel) @@ -224,7 +232,9 @@ For now, Transformers supports SDPA inference and training for the following arc * [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel) * [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel) * [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel) +* [Cohere2](https://huggingface.co/docs/transformers/model_doc/cohere2#transformers.Cohere2Model) * [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel) +* [data2vec_vision](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecVisionModel) * [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel) * [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel) * [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2) @@ -234,6 +244,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel) * [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel) * [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model) +* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel) * [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2) * [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel) * [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel) @@ -241,7 +252,7 @@ For now, Transformers supports SDPA inference and training for the following arc * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel) * [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model) * [Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model) -* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel) +* [I-JEPA](https://huggingface.co/docs/transformers/model_doc/ijepa#transformers.IJepaModel) * [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel) * [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel) * [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel) @@ -255,11 +266,13 @@ For now, Transformers supports SDPA inference and training for the following arc * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mllama](https://huggingface.co/docs/transformers/model_doc/mllama#transformers.MllamaForConditionalGeneration) * 
[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) +* [ModernBert](https://huggingface.co/docs/transformers/model_doc/modernbert#transformers.ModernBert) * [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel) * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel) * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb) * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel) +* [OLMo2](https://huggingface.co/docs/transformers/model_doc/olmo2#transformers.Olmo2Model) * [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel) * [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt) * [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration) @@ -403,7 +416,7 @@ To load a model in 4-bit for inference, use the `load_in_4bit` parameter. The `d from transformers import AutoModelForCausalLM model_name = "bigscience/bloom-2b5" -model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True) +model_4bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True) ``` To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU: @@ -412,7 +425,7 @@ To load a model in 4-bit for inference with multiple GPUs, you can control how m max_memory_mapping = {0: "600MB", 1: "1GB"} model_name = "bigscience/bloom-3b" model_4bit = AutoModelForCausalLM.from_pretrained( - model_name, device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping + model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping ) ``` @@ -430,7 +443,7 @@ To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `d from transformers import AutoModelForCausalLM, BitsAndBytesConfig model_name = "bigscience/bloom-2b5" -model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True)) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True)) ``` If you're loading a model in 8-bit for text generation, you should use the [`~transformers.GenerationMixin.generate`] method instead of the [`Pipeline`] function which is not optimized for 8-bit models and will be slower. Some sampling strategies, like nucleus sampling, are also not supported by the [`Pipeline`] for 8-bit models. 
You should also place all inputs on the same device as the model: @@ -440,7 +453,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig model_name = "bigscience/bloom-2b5" tokenizer = AutoTokenizer.from_pretrained(model_name) -model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=True)) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True)) prompt = "Hello, my llama is cute" inputs = tokenizer(prompt, return_tensors="pt").to("cuda") @@ -454,7 +467,7 @@ To load a model in 4-bit for inference with multiple GPUs, you can control how m max_memory_mapping = {0: "1GB", 1: "2GB"} model_name = "bigscience/bloom-3b" model_8bit = AutoModelForCausalLM.from_pretrained( - model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping + model_name, torch_dtype="auto", device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping ) ``` @@ -513,7 +526,7 @@ quantization_config = BitsAndBytesConfig( ) tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") -model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config) +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype="auto", quantization_config=quantization_config) # enable BetterTransformer model = model.to_bettertransformer() diff --git a/docs/source/en/perf_torch_compile.md b/docs/source/en/perf_torch_compile.md index acc424930b1c4e..2155a403b2b77f 100644 --- a/docs/source/en/perf_torch_compile.md +++ b/docs/source/en/perf_torch_compile.md @@ -27,7 +27,7 @@ To compile any computer vision model of your choice, call `torch.compile()` on t ```diff from transformers import AutoModelForImageClassification -model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to("cuda") +model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to(DEVICE) + model = torch.compile(model) ``` @@ -47,15 +47,17 @@ from PIL import Image import requests import numpy as np from transformers import AutoImageProcessor, AutoModelForImageClassification +from accelerate.test_utils.testing import get_backend +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) url = 'http://images.cocodataset.org/val2017/000000039769.jpg' image = Image.open(requests.get(url, stream=True).raw) processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") -model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to("cuda") +model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to(device) model = torch.compile(model) -processed_input = processor(image, return_tensors='pt').to(device="cuda") +processed_input = processor(image, return_tensors='pt').to(device) with torch.no_grad(): _ = model(**processed_input) @@ -66,13 +68,15 @@ with torch.no_grad(): ```python from transformers import AutoImageProcessor, AutoModelForObjectDetection +from accelerate.test_utils.testing import get_backend +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) 
processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") -model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to("cuda") +model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device) model = torch.compile(model) texts = ["a photo of a cat", "a photo of a dog"] -inputs = processor(text=texts, images=image, return_tensors="pt").to("cuda") +inputs = processor(text=texts, images=image, return_tensors="pt").to(device) with torch.no_grad(): _ = model(**inputs) @@ -82,11 +86,13 @@ with torch.no_grad(): ```python from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation +from accelerate.test_utils.testing import get_backend +device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") -model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to("cuda") +model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to(device) model = torch.compile(model) -seg_inputs = processor(images=image, return_tensors="pt").to("cuda") +seg_inputs = processor(images=image, return_tensors="pt").to(device) with torch.no_grad(): _ = model(**seg_inputs) diff --git a/docs/source/en/perf_train_cpu.md b/docs/source/en/perf_train_cpu.md index 7ef98932d537ac..ab2f735ecbdd50 100644 --- a/docs/source/en/perf_train_cpu.md +++ b/docs/source/en/perf_train_cpu.md @@ -51,7 +51,7 @@ To enable auto mixed precision with IPEX in Trainer, users should add `use_ipex` Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) - Training with IPEX using BF16 auto mixed precision on CPU: -
 python run_qa.py \
+
 python examples/pytorch/question-answering/run_qa.py \
 --model_name_or_path google-bert/bert-base-uncased \
 --dataset_name squad \
 --do_train \
diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md
index ed782caca3b1f1..d6a029c471de08 100644
--- a/docs/source/en/perf_train_cpu_many.md
+++ b/docs/source/en/perf_train_cpu_many.md
@@ -75,7 +75,7 @@ The following command enables training with 2 processes on one Xeon node, with o
  export CCL_WORKER_COUNT=1
  export MASTER_ADDR=127.0.0.1
  mpirun -n 2 -genv OMP_NUM_THREADS=23 \
- python3 run_qa.py \
+ python3 examples/pytorch/question-answering/run_qa.py \
  --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
@@ -104,7 +104,7 @@ Now, run the following command in node0 and **4DDP** will be enabled in node0 an
  export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip
  mpirun -f hostfile -n 4 -ppn 2 \
  -genv OMP_NUM_THREADS=23 \
- python3 run_qa.py \
+ python3 examples/pytorch/question-answering/run_qa.py \
  --model_name_or_path google-bert/bert-large-uncased \
  --dataset_name squad \
  --do_train \
diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md
index 858da99e7bc388..c810a18470a09e 100644
--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@@ -553,7 +553,7 @@ It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter.
 Examples:
 * Sample
 
-Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes be 5 x 2 x 512.
+Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes 5 x 2 x 512.
 
 * Operator
 
diff --git a/docs/source/en/performance.md b/docs/source/en/performance.md
index 94e756cf33ada6..b9176be04ec206 100644
--- a/docs/source/en/performance.md
+++ b/docs/source/en/performance.md
@@ -53,7 +53,7 @@ sections we go through the steps to run inference on CPU and single/multi-GPU se
 
 * [Inference on a single CPU](perf_infer_cpu)
 * [Inference on a single GPU](perf_infer_gpu_one)
-* [Multi-GPU inference](perf_infer_gpu_one)
+* [Multi-GPU inference](perf_infer_gpu_multi)
 * [XLA Integration for TensorFlow Models](tf_xla)
 
 
diff --git a/docs/source/en/perplexity.md b/docs/source/en/perplexity.md
index ac7ef8504e72b6..525f0d567bcb61 100644
--- a/docs/source/en/perplexity.md
+++ b/docs/source/en/perplexity.md
@@ -73,8 +73,9 @@ Let's demonstrate this process with GPT-2.
 
 ```python
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+from accelerate.test_utils.testing import get_backend
 
-device = "cuda"
+device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
 model_id = "openai-community/gpt2-large"
 model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
 tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
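# A minimal sketch (assuming the `model`, `tokenizer`, and `device` set up above, and an
# illustrative sentence) of how perplexity is then obtained: exponentiate the
# language-modeling loss returned when the inputs are also passed as labels.
import torch

text = "The quick brown fox jumps over the lazy dog."
encodings = tokenizer(text, return_tensors="pt").to(device)
with torch.no_grad():
    loss = model(**encodings, labels=encodings["input_ids"]).loss
print(torch.exp(loss).item())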
diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md
index 3363c68ea417a3..357bc7f636ec25 100644
--- a/docs/source/en/pipeline_tutorial.md
+++ b/docs/source/en/pipeline_tutorial.md
@@ -59,10 +59,10 @@ Let's try the [Whisper large-v2](https://huggingface.co/openai/whisper-large-v2)
 benchmarks. It also has the added benefit of predicting punctuation and casing, neither of which are possible with  
 Wav2Vec2.
 
-Let's give it a try here to see how it performs:
+Let's give it a try here to see how it performs. Set `torch_dtype="auto"` to automatically load the most memory-efficient data type the weights are stored in.
 
 ```py
->>> transcriber = pipeline(model="openai/whisper-large-v2")
+>>> transcriber = pipeline(model="openai/whisper-large-v2", torch_dtype="auto")
 >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
 {'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
 ```
diff --git a/docs/source/en/quantization/bitsandbytes.md b/docs/source/en/quantization/bitsandbytes.md
index e9447555e82449..6c6b92d0a6e594 100644
--- a/docs/source/en/quantization/bitsandbytes.md
+++ b/docs/source/en/quantization/bitsandbytes.md
@@ -64,7 +64,7 @@ model_8bit = AutoModelForCausalLM.from_pretrained(
 )
 ```
 
-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
+By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file.
 
 ```py
 import torch
@@ -75,7 +75,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 model_8bit = AutoModelForCausalLM.from_pretrained(
     "facebook/opt-350m", 
     quantization_config=quantization_config, 
-    torch_dtype=torch.float32
+    torch_dtype="auto"
 )
 model_8bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
 ```
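The same `torch_dtype` control also accepts an explicit dtype. As a minimal sketch (reusing the `facebook/opt-350m` checkpoint and 8-bit config from the snippet above), keeping the non-quantized modules in `bfloat16` would look like this:

```py
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model_8bit_bf16 = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
)
# the non-quantized modules (e.g. the final LayerNorm) now follow the requested dtype
print(model_8bit_bf16.model.decoder.layers[-1].final_layer_norm.weight.dtype)
```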
@@ -112,7 +112,7 @@ model_4bit = AutoModelForCausalLM.from_pretrained(
 )
 ```
 
-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
+By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want. Setting `torch_dtype="auto"` loads the model in the data type defined in a model's `config.json` file.
 
 ```py
 import torch
@@ -123,7 +123,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True)
 model_4bit = AutoModelForCausalLM.from_pretrained(
     "facebook/opt-350m",
     quantization_config=quantization_config, 
-    torch_dtype=torch.float32
+    torch_dtype="auto"
 )
 model_4bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
 ```
@@ -190,6 +190,7 @@ Now load your model with the custom `device_map` and `quantization_config`:
 ```py
 model_8bit = AutoModelForCausalLM.from_pretrained(
     "bigscience/bloom-1b7",
+    torch_dtype="auto",
     device_map=device_map,
     quantization_config=quantization_config,
 )
@@ -212,6 +213,7 @@ quantization_config = BitsAndBytesConfig(
 
 model_8bit = AutoModelForCausalLM.from_pretrained(
     model_id,
+    torch_dtype="auto",
     device_map=device_map,
     quantization_config=quantization_config,
 )
@@ -232,6 +234,7 @@ quantization_config = BitsAndBytesConfig(
 
 model_8bit = AutoModelForCausalLM.from_pretrained(
     model_id,
+    torch_dtype="auto",
     device_map="auto",
     quantization_config=quantization_config,
 )
@@ -275,7 +278,7 @@ nf4_config = BitsAndBytesConfig(
     bnb_4bit_quant_type="nf4",
 )
 
-model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)
+model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", quantization_config=nf4_config)
 ```
 
 For inference, the `bnb_4bit_quant_type` does not have a huge impact on performance. However, to remain consistent with the model weights, you should use the `bnb_4bit_compute_dtype` and `torch_dtype` values.
@@ -292,7 +295,7 @@ double_quant_config = BitsAndBytesConfig(
     bnb_4bit_use_double_quant=True,
 )
 
-model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", quantization_config=double_quant_config)
+model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", torch_dtype="auto", quantization_config=double_quant_config)
 ```
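As a quick, optional check (assuming the `model_double_quant` object from the snippet above), you can inspect the quantized model's memory usage with `get_memory_footprint`:

```py
# size of the quantized model's parameters and buffers, in GB
print(f"{model_double_quant.get_memory_footprint() / 1024**3:.2f} GB")
```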
 
 ## Dequantizing `bitsandbytes` models
diff --git a/docs/source/en/quantization/fbgemm_fp8.md b/docs/source/en/quantization/fbgemm_fp8.md
index ff9e18f823c935..61cf8a059bf277 100644
--- a/docs/source/en/quantization/fbgemm_fp8.md
+++ b/docs/source/en/quantization/fbgemm_fp8.md
@@ -33,13 +33,14 @@ pip install --upgrade accelerate fbgemm-gpu torch
 
 If you are having issues with fbgemm-gpu and torch library, you might need to install the nightly release. You can follow the instruction [here](https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries:~:text=found%20here.-,Install%20the%20FBGEMM_GPU%20Package,-Install%20through%20PyTorch)
 
+By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type.
 
 ```py
 from transformers import FbgemmFp8Config, AutoModelForCausalLM, AutoTokenizer
 
 model_name = "meta-llama/Meta-Llama-3-8B"
 quantization_config = FbgemmFp8Config()
-quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
 
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 input_text = "What are we having for dinner?"
diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md
index ef8ed444d9d49b..f3508aed0674f6 100644
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@@ -45,19 +45,20 @@ In short, supporting a wide range of quantization methods allows you to pick the
 
 Use the table below to help you decide which quantization method to use.
 
-| Quantization method                 | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library                             |
-|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------|
-| [AQLM](./aqlm)                                | 🔴                       |  🟢   |     🟢     | 🔴              | 🔴                     | 🟢                      | 1 / 2          | 🟢                                   | 🟢            | 🟢                      | https://github.com/Vahe1994/AQLM            |
-| [AWQ](./awq) | 🔴                       | 🔴   | 🟢        | 🟢              | 🔴                     | ?                       | 4              | 🟢                                   | 🟢            | 🟢                      | https://github.com/casper-hansen/AutoAWQ    |
-| [bitsandbytes](./bitsandbytes)     | 🟢            | 🟡 *   |     🟢     | 🟡 *            | 🔴 **    | 🔴    (soon!)          | 4 / 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/bitsandbytes-foundation/bitsandbytes |
-| [compressed-tensors](./compressed_tensors)                        | 🔴                       | 🟢   |     🟢     | 🟢              | 🔴                     | 🔴                       | 1 - 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/neuralmagic/compressed-tensors |
-| [EETQ](./eetq)                                | 🟢                       | 🔴   | 🟢        | 🔴              | 🔴                     | ?                       | 8              | 🟢                                   | 🟢            | 🟢                      | https://github.com/NetEase-FuXi/EETQ        |
-| GGUF / GGML (llama.cpp)             | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🔴                       | 1 - 8          | 🔴                                   | [See GGUF section](../gguf)                | [See GGUF section](../gguf)                      | https://github.com/ggerganov/llama.cpp      |
-| [GPTQ](./gptq)                                | 🔴                       | 🔴   | 🟢        | 🟢              | 🔴                     | 🔴                       | 2 - 3 - 4 - 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
-| [HQQ](./hqq)                                 | 🟢                       | 🟢    | 🟢        | 🔴              | 🔴                     | 🟢                       | 1 - 8          | 🟢                                   | 🔴            | 🟢                      | https://github.com/mobiusml/hqq/            |
-| [Quanto](./quanto)                              | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🟢                       | 2 / 4 / 8      | 🔴                                   | 🔴            | 🟢                      | https://github.com/huggingface/quanto       |
-| [FBGEMM_FP8](./fbgemm_fp8.md)                              | 🟢                       | 🔴    | 🟢        | 🔴              | 🔴                      | 🔴                        | 8      | 🔴                                   | 🟢            | 🟢                      | https://github.com/pytorch/FBGEMM       |
-| [torchao](./torchao.md)                              | 🟢                       |     | 🟢        | 🔴              | partial support (int4 weight only)       |                       | 4 / 8      |                                   | 🟢🔴           | 🟢                      | https://github.com/pytorch/ao       |
+| Quantization method                 | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library                             |
+|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-----------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------|
+| [AQLM](./aqlm)                                | 🔴                       |  🟢   |     🟢     | 🔴              | 🔴                     | 🔴         | 🟢                      | 1 / 2          | 🟢                                   | 🟢            | 🟢                      | https://github.com/Vahe1994/AQLM            |
+| [AWQ](./awq) | 🔴                       | 🟢   | 🟢        | 🟢              | 🔴                     | 🟢         | ?                       | 4              | 🟢                                   | 🟢            | 🟢                      | https://github.com/casper-hansen/AutoAWQ    |
+| [bitsandbytes](./bitsandbytes)     | 🟢            | 🟡 *   |     🟢     | 🟡 *            | 🔴 **    | 🟡 *       | 🔴    (soon!)          | 4 / 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/bitsandbytes-foundation/bitsandbytes |
+| [compressed-tensors](./compressed_tensors)                        | 🔴                       | 🟢   |     🟢     | 🟢              | 🔴                     | 🔴         | 🔴                       | 1 - 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/neuralmagic/compressed-tensors |
+| [EETQ](./eetq)                                | 🟢                       | 🔴   | 🟢        | 🔴              | 🔴         | 🔴                     | ?                       | 8              | 🟢                                   | 🟢            | 🟢                      | https://github.com/NetEase-FuXi/EETQ        |
+| GGUF / GGML (llama.cpp)             | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🔴         | 🔴                       | 1 - 8          | 🔴                                   | [See GGUF section](../gguf)                | [See GGUF section](../gguf)                      | https://github.com/ggerganov/llama.cpp      |
+| [GPTQ](./gptq)                                | 🔴                       | 🔴   | 🟢        | 🟢              | 🔴                     | 🔴         | 🔴                       | 2 - 3 - 4 - 8          | 🟢                                   | 🟢            | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
+| [HQQ](./hqq)                                 | 🟢                       | 🟢    | 🟢        | 🔴              | 🔴                     | 🔴         | 🟢                       | 1 - 8          | 🟢                                   | 🔴            | 🟢                      | https://github.com/mobiusml/hqq/            |
+| [optimum-quanto](./quanto)                              | 🟢                       | 🟢   | 🟢        | 🔴              | 🟢                     | 🔴         | 🟢                       | 2 / 4 / 8      | 🔴                                   | 🔴            | 🟢                      | https://github.com/huggingface/optimum-quanto       |
+| [FBGEMM_FP8](./fbgemm_fp8.md)                              | 🟢                       | 🔴    | 🟢        | 🔴              | 🔴                      | 🔴         | 🔴                        | 8      | 🔴                                   | 🟢            | 🟢                      | https://github.com/pytorch/FBGEMM       |
+| [torchao](./torchao.md)                              | 🟢                       |     | 🟢        | 🔴              | partial support (int4 weight only)       | 🔴         |                       | 4 / 8      |                                   | 🟢🔴           | 🟢                      | https://github.com/pytorch/ao       |
+| [VPTQ](./vptq)                      | 🔴                       |  🔴   |     🟢     | 🟡              | 🔴      | 🔴                | 🟢                      | 1 - 8          | 🔴                                   | 🟢            | 🟢                      | https://github.com/microsoft/VPTQ            |
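
Most of the methods in the table are enabled either by loading an already-quantized checkpoint or by passing a quantization config to [`~PreTrainedModel.from_pretrained`]. Below is a minimal sketch with bitsandbytes 4-bit on-the-fly quantization; the checkpoint name is only illustrative, and it assumes a CUDA GPU with `bitsandbytes` installed:

```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Quantize the weights to 4-bit NF4 on the fly while loading the model
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    quantization_config=bnb_config,
    device_map="auto",
)
```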
 
 
 
@@ -71,4 +72,4 @@ We value your feedback to help identify bugs before the full release! Check out
 
 \** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships.
 
-
+
\ No newline at end of file
diff --git a/docs/source/en/quantization/quanto.md b/docs/source/en/quantization/quanto.md
index 18135b2ec2fce7..7feadefd83d2aa 100644
--- a/docs/source/en/quantization/quanto.md
+++ b/docs/source/en/quantization/quanto.md
@@ -14,21 +14,21 @@ rendered properly in your Markdown viewer.
 
 -->
 
-# Quanto
+# Optimum-quanto
 
 
 
-Try Quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
+Try optimum-quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
 
 
 
 
-[🤗 Quanto](https://github.com/huggingface/quanto) library is a versatile pytorch quantization toolkit. The quantization method used is the linear quantization. Quanto provides several unique features such as:
+The [🤗 optimum-quanto](https://github.com/huggingface/optimum-quanto) library is a versatile PyTorch quantization toolkit that uses linear quantization. It provides several unique features such as:
 
 - weights quantization (`float8`,`int8`,`int4`,`int2`)
 - activation quantization (`float8`,`int8`)
 - modality agnostic (e.g CV,LLM)
-- device agnostic (e.g CUDA,MPS,CPU)
+- device agnostic (e.g CUDA,XPU,MPS,CPU)
 - compatibility with `torch.compile`
 - easy to add custom kernel for specific device
 - supports quantization aware training
@@ -37,12 +37,14 @@ Try Quanto + transformers with this [notebook](https://colab.research.google.com
 Before you begin, make sure the following libraries are installed:
 
 ```bash
-pip install quanto accelerate transformers
+pip install optimum-quanto accelerate transformers
 ```
 
 Now you can quantize a model by passing [`QuantoConfig`] object in the [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it contains `torch.nn.Linear` layers. 
 
-The integration with transformers only supports weights quantization. For the more complex use case such as activation quantization, calibration and quantization aware training, you should use [quanto](https://github.com/huggingface/quanto) library instead. 
+The integration with transformers only supports weights quantization. For more complex use cases such as activation quantization, calibration, and quantization-aware training, you should use the [optimum-quanto](https://github.com/huggingface/optimum-quanto) library instead.
+
+By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type.
 
 ```py
 from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
@@ -50,12 +52,12 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
 model_id = "facebook/opt-125m"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 quantization_config = QuantoConfig(weights="int8")
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="cuda:0", quantization_config=quantization_config)
 ```
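
As a quick check that the quantized model still generates text, you can run it as usual. This is a minimal sketch that reuses the `tokenizer` and `quantized_model` created above; the prompt is arbitrary:

```py
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(quantized_model.device)
outputs = quantized_model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```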
 
 Note that serialization is not supported yet with transformers but it is coming soon! If you want to save the model, you can use quanto library instead.
 
-Quanto library uses linear quantization algorithm for quantization. Even though this is a basic quantization technique, we get very good results! Have a look at the following benchmark (llama-2-7b on perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/quanto/tree/main/bench/generation)
+The optimum-quanto library uses a linear quantization algorithm. Even though this is a basic quantization technique, it achieves very good results! Have a look at the following benchmark (llama-2-7b on the perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/optimum-quanto/tree/main/bench/generation).
 
 
diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md index cd1d0188c33eb5..38f7c074c97d90 100644 --- a/docs/source/en/quantization/torchao.md +++ b/docs/source/en/quantization/torchao.md @@ -19,6 +19,7 @@ Before you begin, make sure the following libraries are installed with their lat pip install --upgrade torch torchao ``` +By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. ```py import torch @@ -28,7 +29,7 @@ model_name = "meta-llama/Meta-Llama-3-8B" # We support int4_weight_only, int8_weight_only and int8_dynamic_activation_int8_weight # More examples and documentations for arguments can be found in https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques quantization_config = TorchAoConfig("int4_weight_only", group_size=128) -quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config) +quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config) tokenizer = AutoTokenizer.from_pretrained(model_name) input_text = "What are we having for dinner?" diff --git a/docs/source/en/quantization/vptq.md b/docs/source/en/quantization/vptq.md new file mode 100644 index 00000000000000..b86e82f0a3503d --- /dev/null +++ b/docs/source/en/quantization/vptq.md @@ -0,0 +1,111 @@ + + +# VPTQ + +> [!TIP] +> Try VPTQ on [Hugging Face](https://huggingface.co/spaces/microsoft/VPTQ)! +> Try VPTQ on [Google Colab](https://colab.research.google.com/github/microsoft/VPTQ/blob/main/notebooks/vptq_example.ipynb)! +> Know more about VPTQ on [ArXiv](https://arxiv.org/pdf/2409.17066)! + +Vector Post-Training Quantization ([VPTQ](https://github.com/microsoft/VPTQ)) is a novel Post-Training Quantization method that leverages Vector Quantization to high accuracy on LLMs at an extremely low bit-width (<2-bit). VPTQ can compress 70B, even the 405B model, to 1-2 bits without retraining and maintain high accuracy. + +- Better Accuracy on 1-2 bits, (405B @ <2bit, 70B @ 2bit) +- Lightweight Quantization Algorithm: only cost ~17 hours to quantize 405B Llama-3.1 +- Agile Quantization Inference: low decode overhead, best throughput, and TTFT + +Inference support for VPTQ is released in the `vptq` library. Make sure to install it to run the models: +```bash +pip install vptq +``` + +The library provides efficient kernels for NVIDIA/AMD GPU inference. 
+
+To run VPTQ models, simply load a model that has been quantized with VPTQ:
+
+## Inference example
+**Run Llama 3.1 70B on an RTX4090 (24G @ ~2 bits) in real time**
+![Llama3 1-70b-prompt](https://github.com/user-attachments/assets/d8729aca-4e1d-4fe1-ac71-c14da4bdd97f)
+
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft",
+    torch_dtype="auto",
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained("VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft")
+input_ids = tokenizer("hello, it's me", return_tensors="pt").to("cuda")
+out = quantized_model.generate(**input_ids, max_new_tokens=32, do_sample=False)
+```
+
+## Quantize your own model
+An early release of the VPTQ algorithm is available at [VPTQ](https://github.com/microsoft/VPTQ/tree/algorithm);
+check out the [tutorial](https://github.com/microsoft/VPTQ/blob/algorithm/algorithm.md).
+
+## Early Results from Tech Report
+VPTQ achieves better accuracy and higher throughput with lower quantization overhead across models of different sizes. The following experimental results are for reference only; VPTQ can achieve better outcomes under reasonable parameters, especially in terms of model accuracy and inference speed.
+
+
+| Model       | bitwidth | W2↓  | C4↓  | AvgQA↑ | tok/s↑ | mem(GB) | cost/h↓ |
+| ----------- | -------- | ---- | ---- | ------ | ------ | ------- | ------- |
+| LLaMA-2 7B  | 2.02     | 6.13 | 8.07 | 58.2   | 39.9   | 2.28    | 2       |
+|             | 2.26     | 5.95 | 7.87 | 59.4   | 35.7   | 2.48    | 3.1     |
+| LLaMA-2 13B | 2.02     | 5.32 | 7.15 | 62.4   | 26.9   | 4.03    | 3.2     |
+|             | 2.18     | 5.28 | 7.04 | 63.1   | 18.5   | 4.31    | 3.6     |
+| LLaMA-2 70B | 2.07     | 3.93 | 5.72 | 68.6   | 9.7    | 19.54   | 19      |
+|             | 2.11     | 3.92 | 5.71 | 68.7   | 9.7    | 20.01   | 19      |
+
+
+## More Models in [VPTQ-community](https://huggingface.co/VPTQ-community)
+
+⚠️ The repository only provides the model quantization algorithm.
+
+⚠️ The open-source VPTQ-community provides models based on the technical report and quantization algorithm.
+
+
+
+**Quick Estimation of Model Bitwidth (Excluding Codebook Overhead)**:
+
+- **Model Naming Convention**: The model's name includes the **vector length** $v$, **codebook (lookup table) size**, and **residual codebook size**. For example, "Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft" is quantized from "Meta-Llama-3.1-70B-Instruct", where:
+  - **Vector Length**: 8
+  - **Number of Centroids**: 65536 (2^16)
+  - **Number of Residual Centroids**: 256 (2^8)
+- **Equivalent Bitwidth Calculation**:
+  - **Index**: log2(65536) / 8 = 16 / 8 = 2 bits
+  - **Residual Index**: log2(256) / 8 = 8 / 8 = 1 bit
+  - **Total Bitwidth**: 2 + 1 = 3 bits
+- **Model Size Estimation**: 70B * 3 bits / 8 bits per Byte = 26.25 GB
+
+- **Note**: This estimate does not include the size of the codebook (lookup table), other parameter overheads, and the padding overhead for storing indices. For the detailed calculation method, please refer to **Tech Report Appendix C.2**.
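
The equivalent-bitwidth arithmetic above can be reproduced with a few lines of plain Python. This is only an illustrative sketch; the numbers come from the example name "Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft":

```python
import math

vector_length = 8         # "v8" in the model name
num_centroids = 65536     # main codebook size ("k65536")
num_res_centroids = 256   # residual codebook size ("256")

index_bits = math.log2(num_centroids) / vector_length         # 16 / 8 = 2 bits per weight
residual_bits = math.log2(num_res_centroids) / vector_length  # 8 / 8 = 1 bit per weight
total_bits = index_bits + residual_bits                       # 3 bits per weight

# 70B parameters at 3 bits per weight, excluding codebook and padding overhead
approx_size_gb = 70e9 * total_bits / 8 / 1e9                  # ~26.25 GB
print(f"{total_bits:.0f} bits/weight, ~{approx_size_gb:.2f} GB")
```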
+ + +| Model Series | Collections | (Estimated) Bit per weight | +| :--------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------: | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Llama 3.1 Nemotron 70B Instruct HF | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-nemotron-70b-instruct-hf-without-finetune-671730b96f16208d0b3fe942) | [4 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-0-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-16384-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-1024-woft) [1.5 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-256-woft) | +| Llama 3.1 8B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-8b-instruct-without-finetune-66f2b70b1d002ceedef02d2e) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-65536-woft) [3.5 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-4096-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-256-woft) [2.3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft) | +| Llama 3.1 70B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-70b-instruct-without-finetune-66f2bf454d3dd78dfee2ff11) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft) [2.25 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-4-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-0-woft) [1.93 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-32768-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k32768-0-woft) [1.75 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k16384-0-woft) | +| Llama 3.1 405B Instruct | [HF 
🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-405b-instruct-without-finetune-66f4413f9ba55e1a9e52cfb0) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-256-woft) [2 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-65536-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k32768-32768-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-1024-woft) [1.5 bits (1)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k4096-0-woft) [1.5 bits (2)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-256-woft) [1.43 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-128-woft) [1.375 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-64-woft) | +| Mistral Large Instruct 2407 (123B) | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-mistral-large-instruct-2407-without-finetune-6711ebfb7faf85eed9cceb16) | [4 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-0-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-16384-woft) [1.75 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-4096-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-1024-woft) [1.5 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-256-woft) | +| Qwen 2.5 7B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-7b-instruct-without-finetune-66f3e9866d3167cc05ce954a) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k256-256-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v16-k65536-65536-woft) | +| Qwen 2.5 14B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-14b-instruct-without-finetune-66f827f83c7ffa7931b8376c) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k256-256-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v16-k65536-65536-woft) | +| Qwen 2.5 32B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-32b-instruct-without-finetune-66fe77173bf7d64139f0f613) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-256-woft) [2 bits 
(1)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k256-256-woft) | +| Qwen 2.5 72B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-72b-instruct-without-finetune-66f3bf1b3757dfa1ecb481c0) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-256-woft) [2.38 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k1024-512-woft) [2.25 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k512-512-woft) [2.25 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-4-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-0-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-65536-woft) [1.94 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-32768-woft) | +| Reproduced from the tech report | [HF 🤗](https://huggingface.co/collections/VPTQ-community/reproduced-vptq-tech-report-baseline-66fbf1dffe741cc9e93ecf04) | Results from the open source community for reference only, please use them responsibly. | +| Hessian and Inverse Hessian Matrix | [HF 🤗](https://huggingface.co/collections/VPTQ-community/hessian-and-invhessian-checkpoints-66fd249a104850d17b23fd8b) | Collected from RedPajama-Data-1T-Sample, following [Quip#](https://github.com/Cornell-RelaxML/quip-sharp/blob/main/quantize_llama/hessian_offline_llama.py) \ No newline at end of file diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index 404b6eac7fe44b..70afa1ea57107f 100755 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -245,13 +245,15 @@ Check out the [preprocess](./preprocessing) tutorial for more details about toke -🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]: +🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]. + +By default, the weights are loaded in full precision (torch.float32) regardless of the actual data type the weights are stored in such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file to automatically load the most memory-optimal data type. ```py >>> from transformers import AutoModelForSequenceClassification >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment" ->>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) +>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype="auto") ``` @@ -416,12 +418,12 @@ All models are a standard [`torch.nn.Module`](https://pytorch.org/docs/stable/nn Depending on your task, you'll typically pass the following parameters to [`Trainer`]: -1. 
You'll start with a [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module): +1. You'll start with a [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module). Set `torch_dtype="auto"` to automatically load the most memory-efficient data type the weights are stored in. ```py >>> from transformers import AutoModelForSequenceClassification - >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + >>> model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype="auto") ``` 2. [`TrainingArguments`] contains the model hyperparameters you can change like learning rate, batch size, and the number of epochs to train for. The default values are used if you don't specify any training arguments: diff --git a/docs/source/en/tasks/asr.md b/docs/source/en/tasks/asr.md index f3e068444ca556..e8884d327b565b 100644 --- a/docs/source/en/tasks/asr.md +++ b/docs/source/en/tasks/asr.md @@ -20,12 +20,12 @@ rendered properly in your Markdown viewer. -Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users everyday, and there are many other useful user-facing applications like live captioning and note-taking during meetings. +Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users every day, and there are many other useful user-facing applications like live captioning and note-taking during meetings. This guide will show you how to: -1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text. -2. Use your finetuned model for inference. +1. Fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text. +2. Use your fine-tuned model for inference. @@ -49,7 +49,7 @@ We encourage you to login to your Hugging Face account so you can upload and sha ## Load MInDS-14 dataset -Start by loading a smaller subset of the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset. +Start by loading a smaller subset of the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library. This will give you a chance to experiment and make sure everything works before spending more time training on the full dataset. ```py >>> from datasets import load_dataset, Audio @@ -79,13 +79,13 @@ DatasetDict({ }) ``` -While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `transcription` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: +While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, this guide focuses on the `audio` and `transcription`. 
Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: ```py >>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"]) ``` -Take a look at the example again: +Review the example again: ```py >>> minds["train"][0] @@ -112,7 +112,7 @@ The next step is to load a Wav2Vec2 processor to process the audio signal: >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base") ``` -The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model: +The MInDS-14 dataset has a sampling rate of 8000Hz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000Hz to use the pretrained Wav2Vec2 model: ```py >>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) @@ -125,7 +125,7 @@ The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this informati 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"} ``` -As you can see in the `transcription` above, the text contains a mix of upper and lowercase characters. The Wav2Vec2 tokenizer is only trained on uppercase characters so you'll need to make sure the text matches the tokenizer's vocabulary: +As you can see in the `transcription` above, the text contains a mix of uppercase and lowercase characters. The Wav2Vec2 tokenizer is only trained on uppercase characters so you'll need to make sure the text matches the tokenizer's vocabulary: ```py >>> def uppercase(example): @@ -196,7 +196,7 @@ Now instantiate your `DataCollatorForCTCWithPadding`: ## Evaluate -Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric): +Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (refer to the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about loading and computing metrics): ```py >>> import evaluate @@ -236,7 +236,7 @@ If you aren't familiar with finetuning a model with the [`Trainer`], take a look -You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForCTC`]. Specify the reduction to apply with the `ctc_loss_reduction` parameter. It is often better to use the average instead of the default summation: +You are now ready to start training your model! Load Wav2Vec2 with [`AutoModelForCTC`]. Specify the reduction to apply with the `ctc_loss_reduction` parameter. It is often better to use the average instead of the default summation: ```py >>> from transformers import AutoModelForCTC, TrainingArguments, Trainer @@ -252,7 +252,7 @@ At this point, only three steps remain: 1. 
Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the WER and save the training checkpoint. 2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. +3. Call [`~Trainer.train`] to fine-tune your model. ```py >>> training_args = TrainingArguments( @@ -289,7 +289,7 @@ At this point, only three steps remain: >>> trainer.train() ``` -Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model: +Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so it can be accessible to everyone: ```py >>> trainer.push_to_hub() @@ -299,13 +299,13 @@ Once training is completed, share your model to the Hub with the [`~transformers -For a more in-depth example of how to finetune a model for automatic speech recognition, take a look at this blog [post](https://huggingface.co/blog/fine-tune-wav2vec2-english) for English ASR and this [post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) for multilingual ASR. +For a more in-depth example of how to fine-tune a model for automatic speech recognition, take a look at this blog [post](https://huggingface.co/blog/fine-tune-wav2vec2-english) for English ASR and this [post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) for multilingual ASR. ## Inference -Great, now that you've finetuned a model, you can use it for inference! +Great, now that you've fine-tuned a model, you can use it for inference! Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! @@ -318,7 +318,7 @@ Load an audio file you'd like to run inference on. Remember to resample the samp >>> audio_file = dataset[0]["audio"]["path"] ``` -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for automatic speech recognition with your model, and pass your audio file to it: +The simplest way to try out your fine-tuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for automatic speech recognition with your model, and pass your audio file to it: ```py >>> from transformers import pipeline diff --git a/docs/source/en/tasks/audio_classification.md b/docs/source/en/tasks/audio_classification.md index 59d6a175da82ba..973f95e1e9555d 100644 --- a/docs/source/en/tasks/audio_classification.md +++ b/docs/source/en/tasks/audio_classification.md @@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. --> @@ -20,12 +20,12 @@ rendered properly in your Markdown viewer. 
-Audio classification - just like with text - assigns a class label output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds. +Audio classification - just like with text - assigns a class label as output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds. This guide will show you how to: -1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to classify speaker intent. -2. Use your finetuned model for inference. +1. Fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to classify speaker intent. +2. Use your fine-tuned model for inference. @@ -57,7 +57,7 @@ Start by loading the MInDS-14 dataset from the 🤗 Datasets library: >>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train") ``` -Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This'll give you a chance to experiment and make sure everything works before spending more time on the full dataset. +Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This will give you a chance to experiment and make sure everything works before spending more time on the full dataset. ```py >>> minds = minds.train_test_split(test_size=0.2) @@ -79,13 +79,13 @@ DatasetDict({ }) ``` -While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: +While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you will focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method: ```py >>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"]) ``` -Take a look at an example now: +Here's an example: ```py >>> minds["train"][0] @@ -128,7 +128,7 @@ The next step is to load a Wav2Vec2 feature extractor to process the audio signa >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") ``` -The MInDS-14 dataset has a sampling rate of 8000khz (you can find this information in it's [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model: +The MInDS-14 dataset has a sampling rate of 8kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16kHz to use the pretrained Wav2Vec2 model: ```py >>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) @@ -155,7 +155,7 @@ Now create a preprocessing function that: ... return inputs ``` -To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. 
You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that's the name the model expects: +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove unnecessary columns and rename `intent_class` to `label`, as required by the model: ```py >>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True) @@ -208,9 +208,9 @@ You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelFor At this point, only three steps remain: -1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. +1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir`, which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint. 2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function. -3. Call [`~Trainer.train`] to finetune your model. +3. Call [`~Trainer.train`] to fine-tune your model. ```py @@ -252,15 +252,15 @@ Once training is completed, share your model to the Hub with the [`~transformers -For a more in-depth example of how to finetune a model for audio classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb). +For a more in-depth example of how to fine-tune a model for audio classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb). ## Inference -Great, now that you've finetuned a model, you can use it for inference! +Great, now that you've fine-tuned a model, you can use it for inference! -Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! +Load an audio file for inference. Remember to resample the sampling rate of the audio file to match the model's sampling rate, if necessary. ```py >>> from datasets import load_dataset, Audio @@ -271,7 +271,7 @@ Load an audio file you'd like to run inference on. Remember to resample the samp >>> audio_file = dataset[0]["audio"]["path"] ``` -The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it: +The simplest way to try out your fine-tuned model for inference is to use it in a [`pipeline`]. 
Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it: ```py >>> from transformers import pipeline diff --git a/docs/source/en/tasks/idefics.md b/docs/source/en/tasks/idefics.md index a780124edea9c6..7e3335762ea43b 100644 --- a/docs/source/en/tasks/idefics.md +++ b/docs/source/en/tasks/idefics.md @@ -386,9 +386,9 @@ The use and prompting for the conversational use is very similar to using the ba ```py >>> import torch >>> from transformers import IdeficsForVisionText2Text, AutoProcessor +>>> from accelerate.test_utils.testing import get_backend ->>> device = "cuda" if torch.cuda.is_available() else "cpu" - +>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) >>> checkpoint = "HuggingFaceM4/idefics-9b-instruct" >>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device) >>> processor = AutoProcessor.from_pretrained(checkpoint) diff --git a/docs/source/en/tasks/image_captioning.md b/docs/source/en/tasks/image_captioning.md index 633ccc491ebb35..9a78967cb5198d 100644 --- a/docs/source/en/tasks/image_captioning.md +++ b/docs/source/en/tasks/image_captioning.md @@ -256,8 +256,9 @@ image Prepare image for the model. ```python -device = "cuda" if torch.cuda.is_available() else "cpu" - +from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +device, _, _ = get_backend() inputs = processor(images=image, return_tensors="pt").to(device) pixel_values = inputs.pixel_values ``` diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 514ec3fbfe0b93..49fdc9db60d4d7 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -26,7 +26,7 @@ after a natural disaster, monitoring crop health, or helping screen medical imag This guide illustrates how to: -1. Fine-tune [ViT](model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image. +1. Fine-tune [ViT](../model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image. 2. Use your fine-tuned model for inference. diff --git a/docs/source/en/tasks/image_feature_extraction.md b/docs/source/en/tasks/image_feature_extraction.md index c9d794b0b2be38..e55a9e2379d531 100644 --- a/docs/source/en/tasks/image_feature_extraction.md +++ b/docs/source/en/tasks/image_feature_extraction.md @@ -43,8 +43,9 @@ Let's see the pipeline in action. First, initialize the pipeline. If you don't p ```python import torch from transformers import pipeline - -DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +DEVICE, _, _ = get_backend() pipe = pipeline(task="image-feature-extraction", model_name="google/vit-base-patch16-384", device=DEVICE, pool=True) ``` @@ -83,7 +84,7 @@ If you want to get the last hidden states before pooling, avoid passing any valu ```python pipe = pipeline(task="image-feature-extraction", model_name="google/vit-base-patch16-224", device=DEVICE) -output = pipe(image_real) +outputs = pipe(image_real) ``` Since the outputs are unpooled, we get the last hidden states where the first dimension is the batch size, and the last two are the embedding shape. 
diff --git a/docs/source/en/tasks/image_text_to_text.md b/docs/source/en/tasks/image_text_to_text.md index 261abf947290d1..28bd98457ee016 100644 --- a/docs/source/en/tasks/image_text_to_text.md +++ b/docs/source/en/tasks/image_text_to_text.md @@ -120,6 +120,46 @@ print(generated_texts) ## ['User: What do we see in this image? \nAssistant: In this image we can see two cats on the nets. \nUser: And how about this image? \nAssistant: In this image we can see flowers, plants and insect.'] ``` +## Pipeline + +The fastest way to get started is to use the [`Pipeline`] API. Specify the `"image-text-to-text"` task and the model you want to use. + +```python +from transformers import pipeline +pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf") +``` + +The example below uses chat templates to format the text inputs. + +```python +messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg", + }, + {"type": "text", "text": "Describe this image."}, + ], + }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "There's a pink flower"}, + ], + }, + ] +``` + +Pass the chat template formatted text and image to [`Pipeline`] and set `return_full_text=False` to remove the input from the generated output. + +```python +outputs = pipe(text=messages, max_new_tokens=20, return_full_text=False) +outputs[0]["generated_text"] +# with a yellow center in the foreground. The flower is surrounded by red and white flowers with green stems +``` + ## Streaming We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS-8B. @@ -189,7 +229,7 @@ Now let's call the `model_inference` function we created and stream the values. ```python generator = model_inference( user_prompt="And what is in this image?", - chat_history=messages, + chat_history=messages[:2], max_new_tokens=100, images=images ) diff --git a/docs/source/en/tasks/image_to_image.md b/docs/source/en/tasks/image_to_image.md index 0bb74b36980e0b..f1c62e47aebf24 100644 --- a/docs/source/en/tasks/image_to_image.md +++ b/docs/source/en/tasks/image_to_image.md @@ -37,8 +37,9 @@ We can now initialize the pipeline with a [Swin2SR model](https://huggingface.co ```python from transformers import pipeline import torch - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +device, _, _ = get_backend() pipe = pipeline(task="image-to-image", model="caidas/swin2SR-lightweight-x2-64", device=device) ``` diff --git a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md index 530e92d81f5c0d..c1ccafb6fc5d2a 100644 --- a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md +++ b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md @@ -17,7 +17,7 @@ rendered properly in your Markdown viewer. [[open-in-colab]] -Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). 
To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification for this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between it's outputs and the teacher's outputs, thus making it mimic the behavior. It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this. +Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification for this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between its outputs and the teacher's outputs, thus making it mimic the behavior. It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this. This guide demonstrates how you can distill a [fine-tuned ViT model](https://huggingface.co/merve/vit-mobilenet-beans-224) (teacher model) to a [MobileNet](https://huggingface.co/google/mobilenet_v2_1.4_224) (student model) using the [Trainer API](https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer) of 🤗 Transformers. @@ -58,7 +58,7 @@ from transformers import TrainingArguments, Trainer import torch import torch.nn as nn import torch.nn.functional as F - +from accelerate.test_utils.testing import get_backend class ImageDistilTrainer(Trainer): def __init__(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None, *args, **kwargs): @@ -66,7 +66,7 @@ class ImageDistilTrainer(Trainer): self.teacher = teacher_model self.student = student_model self.loss_function = nn.KLDivLoss(reduction="batchmean") - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) self.teacher.to(device) self.teacher.eval() self.temperature = temperature diff --git a/docs/source/en/tasks/mask_generation.md b/docs/source/en/tasks/mask_generation.md index 82202f58bca607..db16e035e303e0 100644 --- a/docs/source/en/tasks/mask_generation.md +++ b/docs/source/en/tasks/mask_generation.md @@ -125,9 +125,9 @@ the processor. ```python from transformers import SamModel, SamProcessor import torch - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - +from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) 
+device, _, _ = get_backend() model = SamModel.from_pretrained("facebook/sam-vit-base").to(device) processor = SamProcessor.from_pretrained("facebook/sam-vit-base") ``` diff --git a/docs/source/en/tasks/monocular_depth_estimation.md b/docs/source/en/tasks/monocular_depth_estimation.md index 3ded3179154aae..edd22122f32bd6 100644 --- a/docs/source/en/tasks/monocular_depth_estimation.md +++ b/docs/source/en/tasks/monocular_depth_estimation.md @@ -53,8 +53,9 @@ Instantiate a pipeline from a [checkpoint on the Hugging Face Hub](https://huggi ```py >>> from transformers import pipeline >>> import torch - ->>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +>>> device, _, _ = get_backend() >>> checkpoint = "depth-anything/Depth-Anything-V2-base-hf" >>> pipe = pipeline("depth-estimation", model=checkpoint, device=device) ``` diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md index 06eb45eda99150..18b12f2166637e 100644 --- a/docs/source/en/tasks/multiple_choice.md +++ b/docs/source/en/tasks/multiple_choice.md @@ -419,7 +419,7 @@ Get the class with the highest probability: ```py >>> predicted_class = logits.argmax().item() >>> predicted_class -'0' +0 ``` @@ -448,7 +448,7 @@ Get the class with the highest probability: ```py >>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) >>> predicted_class -'0' +0 ``` diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index fdc81896bc1924..c307dd3334fe92 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -1488,7 +1488,9 @@ Now that you have finetuned a model, evaluated it, and uploaded it to the Huggin Load model and image processor from the Hugging Face Hub (skip to use already trained in this session): ```py ->>> device = "cuda" +>>> from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +>>> device, _, _ = get_backend() >>> model_repo = "qubvel-hf/detr_finetuned_cppe5" >>> image_processor = AutoImageProcessor.from_pretrained(model_repo) diff --git a/docs/source/en/tasks/question_answering.md b/docs/source/en/tasks/question_answering.md index 998010e67ca95f..41d7fd48cf816e 100644 --- a/docs/source/en/tasks/question_answering.md +++ b/docs/source/en/tasks/question_answering.md @@ -325,7 +325,7 @@ or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/no Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so you're not completely in the dark about your model's performance. -If have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) chapter from the 🤗 Hugging Face Course! +If you have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) chapter from the 🤗 Hugging Face Course! 
## Inference @@ -397,7 +397,7 @@ Tokenize the text and return TensorFlow tensors: >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") ->>> inputs = tokenizer(question, text, return_tensors="tf") +>>> inputs = tokenizer(question, context, return_tensors="tf") ``` Pass your inputs to the model and return the `logits`: diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md index 912577589486ce..a21ff62edf1a56 100644 --- a/docs/source/en/tasks/semantic_segmentation.md +++ b/docs/source/en/tasks/semantic_segmentation.md @@ -689,7 +689,9 @@ Reload the dataset and load an image for inference. We will now see how to infer without a pipeline. Process the image with an image processor and place the `pixel_values` on a GPU: ```py ->>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use GPU if available, otherwise use a CPU +>>> from accelerate.test_utils.testing import get_backend +# automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) +>>> device, _, _ = get_backend() >>> encoding = image_processor(image, return_tensors="pt") >>> pixel_values = encoding.pixel_values.to(device) ``` diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md index 7d7ecf1fbab6db..e16dd17dfe1fc8 100644 --- a/docs/source/en/tasks/summarization.md +++ b/docs/source/en/tasks/summarization.md @@ -283,7 +283,7 @@ Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: ```py >>> from transformers.keras_callbacks import KerasMetricCallback ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) ``` Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: diff --git a/docs/source/en/tasks/text-to-speech.md b/docs/source/en/tasks/text-to-speech.md index 188d4ea5f9ee68..e25da4e19efeaa 100644 --- a/docs/source/en/tasks/text-to-speech.md +++ b/docs/source/en/tasks/text-to-speech.md @@ -282,10 +282,10 @@ containing the corresponding speaker embedding. >>> import os >>> import torch >>> from speechbrain.inference.classifiers import EncoderClassifier +>>> from accelerate.test_utils.testing import get_backend >>> spk_model_name = "speechbrain/spkrec-xvect-voxceleb" - ->>> device = "cuda" if torch.cuda.is_available() else "cpu" +>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.) >>> speaker_model = EncoderClassifier.from_hparams( ... source=spk_model_name, ... 
run_opts={"device": device}, diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md index 426ba1c340fb81..922cdc7241176a 100644 --- a/docs/source/en/tasks/translation.md +++ b/docs/source/en/tasks/translation.md @@ -290,7 +290,7 @@ Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: ```py >>> from transformers.keras_callbacks import KerasMetricCallback ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) ``` Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: diff --git a/docs/source/en/tasks/video_text_to_text.md b/docs/source/en/tasks/video_text_to_text.md index fcc1c86e8bd7ac..3929f7994bdafb 100644 --- a/docs/source/en/tasks/video_text_to_text.md +++ b/docs/source/en/tasks/video_text_to_text.md @@ -47,7 +47,7 @@ model_id = "llava-hf/llava-interleave-qwen-0.5b-hf" processor = LlavaProcessor.from_pretrained(model_id) model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16) -model.to("cuda") +model.to("cuda") # can also be xpu, mps, npu etc. depending on your hardware accelerator ``` Some models directly consume the `