From 250f6d88e6fbb6b1179f681bef2bbdb7095d3a55 Mon Sep 17 00:00:00 2001 From: dhuangnm <74931910+dhuangnm@users.noreply.github.com> Date: Mon, 24 Jun 2024 15:02:30 -0400 Subject: [PATCH 1/2] bump version to 0.5.1 (#330) Co-authored-by: dhuangnm --- vllm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/version.py b/vllm/version.py index db951e35dac97..368411c8059c8 100644 --- a/vllm/version.py +++ b/vllm/version.py @@ -1,2 +1,2 @@ # UPSTREAM SYNC: take downstream -__version__ = "0.5.0" +__version__ = "0.5.1" From 05c3004489c0c0501dba5fab72b254e3014f1ae5 Mon Sep 17 00:00:00 2001 From: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com> Date: Mon, 24 Jun 2024 21:10:12 -0400 Subject: [PATCH 2/2] cross python whl (#315) SUMMARY: * transition workflows to one cross python whl. minimum supported python is 3.8. * adjustments to actions resulting from using one whl * adjustments to workflows resulting from using one whl * updating parameters to enable passing in an "array" of test labels, python versions, and tests to run. * upload whl to GCP storage bucket organized by run id. this will be used in a later PR to allow us to install and run from GCP directly. * deletion of WEEKLY and RELEASE workflows. these can now be handled via the NIGHTLY by just specifying different input parameters. * removal of "WEEKLY" workflow category. we should just push this semantic into "RELEASE". * add "pytest skip if" for some tests to avoid running them on T4s (<8.0 compute capability) * "remote push" will now run on "l4 solo" and "l4 duo". there are still some issues with the T4s, so we are moving all jobs to run on L4s. we should address the T4 issues in a separate PR. * adding "python cache" semantics. this roughly cuts build times in half. building the whl now takes about 25 minutes. this is in line with the times when we were using GCP static runners. * clean up "publish binaries" parameter. moved it to a boolean and set the default to `false`. this makes everything cleaner and also cleans up the GHA UI when manually triggering workflows.
* remove reference to skip list in "run tests" TEST PLAN: runs on remote push --------- Co-authored-by: andy-neuma --- .github/actions/nm-build-vllm/action.yml | 37 +----- .github/actions/nm-caches/action.yml | 11 ++ .github/actions/nm-hf-cache/action.yml | 13 --- .github/actions/nm-install-whl/action.yml | 3 +- .github/actions/nm-set-env/action.yml | 2 + .github/actions/nm-summary-build/action.yml | 4 - .github/actions/nm-test-whl/action.yml | 4 +- .github/scripts/run-tests | 34 ------ .github/workflows/nm-benchmark.yml | 17 +-- .github/workflows/nm-build-test.yml | 95 ++++++++------- .github/workflows/nm-build.yml | 33 +++++- .github/workflows/nm-lm-eval.yml | 8 +- .github/workflows/nm-nightly.yml | 109 ++++------------- .github/workflows/nm-release.yml | 110 ------------------ .github/workflows/nm-remote-push.yml | 74 +----------- .github/workflows/nm-test.yml | 12 +- .github/workflows/nm-upload-assets-to-gcp.yml | 6 + .github/workflows/nm-weekly.yml | 41 ------- tests/engine/test_stop_strings.py | 9 ++ .../test_llm_generate_multiple_loras.py | 6 + tests/models_core/test_llm_logprobs.py | 3 + tests/models_core/test_magic_wand.py | 7 +- tests/models_core/test_server_logprobs.py | 2 + .../test_disable_sliding_window.py | 6 + tests/quantization/test_configs.py | 3 + tests/test_regression.py | 6 + 26 files changed, 191 insertions(+), 464 deletions(-) create mode 100644 .github/actions/nm-caches/action.yml delete mode 100644 .github/actions/nm-hf-cache/action.yml delete mode 100644 .github/workflows/nm-release.yml delete mode 100644 .github/workflows/nm-weekly.yml diff --git a/.github/actions/nm-build-vllm/action.yml b/.github/actions/nm-build-vllm/action.yml index c78a9a8b27d65..fc629242248fb 100644 --- a/.github/actions/nm-build-vllm/action.yml +++ b/.github/actions/nm-build-vllm/action.yml @@ -7,15 +7,9 @@ inputs: venv: description: 'name for python virtual environment' required: true - pypi: - description: 'ip address for pypi server' - required: true outputs: - build_status: - description: "final status from 'pip install -e'" - value: ${{ steps.build.outputs.build_status }} whl_status: - description: "final status from 'pip3 wheel --no-deps -w dist'" + description: "final status from constructing the whl" value: ${{ steps.build.outputs.whl_status }} whl: description: 'basename for generated whl' @@ -31,35 +25,16 @@ runs: COMMIT=${{ github.sha }} VENV="${{ inputs.venv }}-${COMMIT:0:7}" source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate - # TODO: adjust when we need a proper release. use nightly now. pip3 install -r requirements-cuda.txt -r requirements-build.txt - # build - SUCCESS=0 - pip3 install -e . || SUCCESS=$? - echo "build_status=${SUCCESS}" >> "$GITHUB_OUTPUT" - if [ ${SUCCESS} -ne 0 ]; then - exit 1 - fi - # strip binaries - if [ ! $(command -v strip) ]; then - sudo apt install -y binutils - fi - if [ ! $(command -v file) ]; then - sudo apt install -y file - fi - for eachso in $(find . -type f -name '*.so') - do - strip $eachso - file $eachso - done # whl SUCCESS=0 - pip3 wheel --no-deps -w dist . || SUCCESS=$? + python setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 || SUCCESS=$? 
echo "whl_status=${SUCCESS}" >> "$GITHUB_OUTPUT" - BASE=$(./.github/scripts/convert-version ${{ inputs.python }}) ls -alh dist - WHL_FILEPATH=$(find dist -iname "*${BASE}*.whl") + WHL_FILEPATH=$(find dist -type f -iname "*linux_x86_64.whl") + echo "whl: ${WHL_FILEPATH}" RENAME=$(echo ${WHL_FILEPATH} | sed -e 's/linux_x86_64/manylinux_2_17_x86_64/') + echo "rename: ${RENAME}" mv ${WHL_FILEPATH} ${RENAME} WHL=$(basename ${RENAME}) echo "whl=${WHL}" >> "$GITHUB_OUTPUT" @@ -67,7 +42,7 @@ runs: exit 1 fi # sdist - python3 setup.py sdist || SUCCESS=$? + python setup.py sdist || SUCCESS=$? pyenv uninstall --force ${{ inputs.python}}/envs/${VENV} ls -alh dist TAR_FILEPATH=$(find dist -type f -iname "*.tar.gz") diff --git a/.github/actions/nm-caches/action.yml b/.github/actions/nm-caches/action.yml new file mode 100644 index 0000000000000..db4db069a7d7a --- /dev/null +++ b/.github/actions/nm-caches/action.yml @@ -0,0 +1,11 @@ +name: set up caches +description: 'set up HF and Python caches' +runs: + using: composite + steps: + - run: | + sudo mkdir -m 777 -p ${HF_HOME} + sudo chown -R $(whoami):$(whoami) ${HF_HOME} + sudo mkdir -m 777 -p ${PIP_CACHE_DIR} + sudo chown -R $(whoami):$(whoami) ${PIP_CACHE_DIR} + shell: bash diff --git a/.github/actions/nm-hf-cache/action.yml b/.github/actions/nm-hf-cache/action.yml deleted file mode 100644 index 62f54703c9e65..0000000000000 --- a/.github/actions/nm-hf-cache/action.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: HF cache -description: 'mount HF cache' -inputs: - fs_cache: - description: '(deprecated) filesystem to use for HF cache' - required: true -runs: - using: composite - steps: - - run: | - sudo mkdir -m 777 -p ${HF_HOME} - sudo chown -R $(whoami):$(whoami) ${HF_HOME} - shell: bash diff --git a/.github/actions/nm-install-whl/action.yml b/.github/actions/nm-install-whl/action.yml index e229d0bc95d2b..95d6722bb658f 100644 --- a/.github/actions/nm-install-whl/action.yml +++ b/.github/actions/nm-install-whl/action.yml @@ -22,8 +22,7 @@ runs: source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate fi pip3 install -r requirements-dev.txt - BASE=$(./.github/scripts/convert-version ${{ inputs.python }}) - WHL=$(find . -type f -iname "*${BASE}*.whl") + WHL=$(find . 
-type f -iname "nm_vllm*.whl") WHL_BASENAME=$(basename ${WHL}) echo "whl=${WHL_BASENAME}" >> "$GITHUB_OUTPUT" pip3 install ${WHL}[sparse] --extra-index-url https://pypi.neuralmagic.com/simple diff --git a/.github/actions/nm-set-env/action.yml b/.github/actions/nm-set-env/action.yml index dd8ef9e99d61c..b262bda10d44e 100644 --- a/.github/actions/nm-set-env/action.yml +++ b/.github/actions/nm-set-env/action.yml @@ -28,6 +28,8 @@ runs: # HF Cache echo "HF_TOKEN=${HF_TOKEN_SECRET}" >> $GITHUB_ENV echo "HF_HOME=/model-cache" >> $GITHUB_ENV + # Python cache + echo "PIP_CACHE_DIR=/model-cache/python-cache" >> $GITHUB_ENV # build NUM_THREADS=$(./.github/scripts/determine-threading -G ${{ inputs.Gi_per_thread }}) echo "MAX_JOBS=${NUM_THREADS}" >> $GITHUB_ENV diff --git a/.github/actions/nm-summary-build/action.yml b/.github/actions/nm-summary-build/action.yml index a890f5ad016f2..6f0eb3b457a82 100644 --- a/.github/actions/nm-summary-build/action.yml +++ b/.github/actions/nm-summary-build/action.yml @@ -13,9 +13,6 @@ inputs: python: description: 'python version info' required: true - build_status: - description: 'status from build step' - required: true whl_status: description: 'status from build step' required: true @@ -37,6 +34,5 @@ runs: echo "| gitref: | '${{ inputs.gitref }}' |" >> $GITHUB_STEP_SUMMARY echo "| branch name: | '${{ github.ref_name }}' |" >> $GITHUB_STEP_SUMMARY echo "| python: | ${{ inputs.python }} |" >> $GITHUB_STEP_SUMMARY - echo "| build: | ${BUILD_EMOJI} |" >> $GITHUB_STEP_SUMMARY echo "| whl: | ${WHL_EMOJI} |" >> $GITHUB_STEP_SUMMARY shell: bash diff --git a/.github/actions/nm-test-whl/action.yml b/.github/actions/nm-test-whl/action.yml index 53ca57598f8f8..557374fa11b08 100644 --- a/.github/actions/nm-test-whl/action.yml +++ b/.github/actions/nm-test-whl/action.yml @@ -19,13 +19,15 @@ runs: steps: - id: test_whl run: | + sudo mkdir -m 777 -p /usr/local/apps + sudo chown -R $(whoami):$(whoami) /usr/local/apps pip install coverage pip install pytest-cov pip install pytest-xdist pip install -r requirements-dev.txt SUCCESS=0 VLLM_SRC=$(python3 -c "import vllm; print(vllm.__path__[0])") - ./.github/scripts/run-tests -s ${VLLM_SRC} -t ${{ inputs.test_directory }} -r ${{ inputs.test_results }} -f ${{ inputs.test_skip_list }}|| SUCCESS=$? + ./.github/scripts/run-tests -s ${VLLM_SRC} -t ${{ inputs.test_directory }} -r ${{ inputs.test_results }} || SUCCESS=$? pytest ./neuralmagic/tests/test_nm-vllm_licenses.py --junitxml=${{ inputs.test_results }}/test_nm-vllm_licenses.xml echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT" exit ${SUCCESS} diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests index bea7fffd7a93a..e2e12772d0816 100755 --- a/.github/scripts/run-tests +++ b/.github/scripts/run-tests @@ -8,7 +8,6 @@ usage() { echo " -s - src directory, i.e. location of package *.py files." echo " -t - test directory, i.e. location of *.py test files. (default 'tests/')" echo " -r - desired results base directory. xml results will mirror provided tests directory structure. (default 'test-results/')" - echo " -f - file with test skip list, e.g. ' neuralmagic/tests/skip-for-remote-push.txt'. 
(default is to run all found tests)" echo " -h - this list of options" echo echo "note: all paths are relative to 'nm-vllm' root" @@ -35,9 +34,6 @@ while getopts "hs:t:r:f:" OPT; do r) RESULTS_DIR="${OPTARG}" ;; - f) - SKIP_LIST="${OPTARG}" - ;; esac done @@ -71,36 +67,6 @@ for FOUND in "${TESTS_FOUND[@]}"; do echo "${FOUND}" done -# build the skip list from provided file -declare -a TESTS_TO_EXCLUDE -if [ -f "${SKIP_LIST}" ]; then - while IFS= read -r line - do - TESTS_TO_EXCLUDE+=("${line}") - done < "${SKIP_LIST}" -fi - -echo "..." -for EXCLUDE in "${TESTS_TO_EXCLUDE[@]}"; do - for JJ in "${!TESTS_FOUND[@]}"; do - if [[ ${TESTS_FOUND[$JJ]} = ${EXCLUDE} ]]; then - echo "excluding: ${EXCLUDE}" - unset 'TESTS_FOUND[$JJ]' - fi - done -done - -echo "..." -echo "planning to run:" -for TEST in "${TESTS_FOUND[@]}" -do - echo "${TEST}" -done -echo "..." - -# download required artifacts for testing -# (cd ${TEST_DIR} && sudo bash ../.buildkite/download-images.sh) - # run selected tests SUCCESS=0 CC_PYTEST_FLAGS="--cov=${SRC_DIR} --cov=${TEST_DIR} --cov-report=html:cc-vllm-html --cov-append" diff --git a/.github/workflows/nm-benchmark.yml b/.github/workflows/nm-benchmark.yml index bac8133fd5c97..9ab70c84a357c 100644 --- a/.github/workflows/nm-benchmark.yml +++ b/.github/workflows/nm-benchmark.yml @@ -29,7 +29,7 @@ on: required: true push_benchmark_results_to_gh_pages: description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: string + type: boolean required: true # makes workflow manually callable @@ -61,11 +61,8 @@ on: required: true push_benchmark_results_to_gh_pages: description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: choice - options: - - 'true' - - 'false' - default: 'false' + type: boolean + default: false env: BENCHMARK_RESULTS: /model-cache/benchmark_results @@ -109,11 +106,9 @@ jobs: Gi_per_thread: 1 nvcc_threads: 0 - - name: hf cache - id: hf_cache - uses: ./.github/actions/nm-hf-cache/ - with: - fs_cache: ${{ secrets.HF_FS_CACHE }} + - name: caches + id: caches + uses: ./.github/actions/nm-caches/ - name: download whl id: download diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml index 1a037a837ed58..e85dd2b199745 100644 --- a/.github/workflows/nm-build-test.yml +++ b/.github/workflows/nm-build-test.yml @@ -1,27 +1,29 @@ name: nm build-test on: + # makes workflow reusable workflow_call: inputs: wf_category: - description: "categories: REMOTE, NIGHTLY, WEEKLY, RELEASE" + description: "workflow category: REMOTE, NIGHTLY, RELEASE" type: string default: "REMOTE" - push_binaries_to_pypi: + push_to_pypi: description: "When set to true, built wheels and tar.gz will be pushed to neuralmagic pypi if all tests pass" - type: string - default: 'no' + type: boolean + default: false python: description: "python version, e.g. 
3.10.12" type: string required: true + # build related parameters build_label: description: "requested runner label (specifies instance)" type: string default: gcp-k8s-build build_timeout: - description: "time limit for build in minutes " + description: "time limit for build in minutes" type: string default: "120" Gi_per_thread: @@ -32,27 +34,29 @@ on: description: "number of threads nvcc build threads" type: string default: "8" + # test related parameters - test_label_solo: - description: "requested runner label (specifies instance)" - type: string - required: true - test_label_multi: - description: "requested runner label (specifies instance)" + + # stringified Json array of maps + # each map has a "python", "gha label", "test skip env vars" e.g. + # [ + # {'python':'3.8.17','label':'gcp-k8s-l4-solo','test':'neuralmagic/tests/test_skip_env_vars/smoke.txt'}, + # ... + # ] + test_configs: + description: "python, label, skip envs" type: string required: true + test_timeout: - description: "time limit for test run in minutes " + description: "time limit for test run in minutes" type: string required: true gitref: description: "git commit hash or branch name" type: string required: true - test_skip_env_vars: - description: 'file with list of env vars controlling which tests to run' - type: string - required: true + # benchmark related parameters benchmark_label: description: "requested benchmark label (specifies instance)" @@ -67,9 +71,10 @@ on: type: string default: "720" push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: string - default: "false" + description: "when set to true, the workflow pushes all benchmarking results to gh-pages UI" + type: boolean + default: false + # lm-eval related parameters lm_eval_label: description: "requested runner label (specifies instance)" @@ -82,11 +87,24 @@ on: lm_eval_configuration: description: "configuration for lm-eval test (see neuralmagic/lm-eval)" type: string - default: "" + default: "" jobs: + JSON-VALIDATE: + runs-on: gcp-k8s-util + strategy: + matrix: + test_config: ${{ fromJson(inputs.test_configs) }} + steps: + - name: validate test config + run: | + echo "python: ${{ matrix.test_config.python }}" + echo "label: ${{ matrix.test_config.label }}" + echo "tests: ${{ matrix.test_config.test }}" + BUILD: + needs: [JSON-VALIDATE] uses: ./.github/workflows/nm-build.yml with: wf_category: ${{ inputs.wf_category }} @@ -98,33 +116,23 @@ jobs: python: ${{ inputs.python }} secrets: inherit - TEST-SOLO: + TEST: needs: [BUILD] if: success() + strategy: + fail-fast: false + matrix: + test_config: ${{ fromJson(inputs.test_configs) }} uses: ./.github/workflows/nm-test.yml with: - test_label: ${{ inputs.test_label_solo }} + test_label: ${{ matrix.test_config.label }} timeout: ${{ inputs.test_timeout }} gitref: ${{ github.ref }} - python: ${{ inputs.python }} - whl: ${{ needs.BUILD.outputs.whl }} - test_skip_env_vars: ${{ inputs.test_skip_env_vars }} + python: ${{ matrix.test_config.python }} + whl: ${{ needs.BUILD.output.whl }} + test_skip_env_vars: ${{ matrix.test_config.test }} secrets: inherit - # TODO: re-enable - # TEST-MULTI: - # needs: [BUILD] - # if: success() && contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category) - # uses: ./.github/workflows/nm-test.yml - # with: - # test_label: ${{ inputs.test_label_multi }} - # timeout: ${{ inputs.test_timeout }} - # gitref: ${{ github.ref }} - # python: ${{ inputs.python }} - # whl: ${{ 
needs.BUILD.outputs.whl }} - # test_skip_env_vars: ${{ inputs.test_skip_env_vars }} - # secrets: inherit - BENCHMARK: needs: [BUILD] if: success() @@ -140,7 +148,7 @@ jobs: push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" secrets: inherit - LM-EVAL-SOLO: + LM-EVAL: needs: [BUILD] uses: ./.github/workflows/nm-lm-eval.yml with: @@ -152,12 +160,13 @@ jobs: lm_eval_configuration: ${{ inputs.lm_eval_configuration }} secrets: inherit + # uploading is only available when using GCP autoscaling group UPLOAD: - needs: [TEST-SOLO, BENCHMARK, LM-EVAL-SOLO] - if: ${{ contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category) && inputs.push_binaries_to_pypi == 'yes' }} + needs: [TEST, BENCHMARK, LM-EVAL] + if: ${{ inputs.push_to_pypi }} uses: ./.github/workflows/nm-upload-assets-to-gcp.yml with: - label: ${{ inputs.build_label }} + label: gcp-k8s-util timeout: ${{ inputs.build_timeout }} gitref: ${{ github.ref }} python: ${{ inputs.python }} diff --git a/.github/workflows/nm-build.yml b/.github/workflows/nm-build.yml index 5ac43dd8db1bc..d8672a5118971 100644 --- a/.github/workflows/nm-build.yml +++ b/.github/workflows/nm-build.yml @@ -73,7 +73,13 @@ jobs: runs-on: ${{ inputs.build_label }} timeout-minutes: ${{ fromJson(inputs.timeout) }} + + permissions: + contents: 'read' + id-token: 'write' + outputs: + run_id: ${{ github.run_id }} whl: ${{ steps.build.outputs.whl }} tarfile: ${{ steps.build.outputs.tarfile }} @@ -118,7 +124,27 @@ jobs: with: python: ${{ inputs.python }} venv: ${{ env.VENV_BASE }} - pypi: ${{ secrets.NM_PRIVATE_PYPI_LOCATION }} + + # GCP + - name: 'Authenticate to Google Cloud' + id: auth + uses: google-github-actions/auth@v2.1.3 + with: + project_id: ${{ secrets.GCP_PROJECT }} + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.NM_PYPI_SA }} + + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v2' + with: + version: '>= 473.0.0' + + - name: copy whl and source distribution + run: | + # echo "whl: ${{ steps.build.outputs.whl }}" + # echo "tarfile: ${{ steps.build.outputs.tarfile }}" + gcloud storage cp dist/${{ steps.build.outputs.whl }} gs://neuralmagic-public-pypi/assets/${{ github.run_id }}/${{ steps.build.outputs.whl }} + gcloud storage cp dist/${{ steps.build.outputs.tarfile }} gs://neuralmagic-public-pypi/assets/${{ github.run_id }}/${{ steps.build.outputs.tarfile }} - name: upload whl uses: actions/upload-artifact@v4 @@ -126,7 +152,7 @@ jobs: with: name: ${{ steps.build.outputs.whl }} path: dist/${{ steps.build.outputs.whl }} - retention-days: 15 + retention-days: 5 - name: upload tar.gz uses: actions/upload-artifact@v4 @@ -144,7 +170,6 @@ jobs: gitref: ${{ inputs.gitref }} testmo_run_url: https://neuralmagic.testmo.net/automation/runs/view/${{ steps.create_testmo_run.outputs.id }} python: ${{ steps.set_python.outputs.version }} - build_status: ${{ steps.build.outputs.build_status }} whl_status: ${{ steps.build.outputs.whl_status }} - name: run status @@ -154,9 +179,7 @@ jobs: BUILD_STATUS: ${{ steps.build.outputs.build_status }} WHL_STATUS: ${{ steps.build.outputs.whl_status }} run: | - echo "build status: ${BUILD_STATUS}" echo "build status: ${WHL_STATUS}" - if [ -z "${BUILD_STATUS}" ] || [ "${BUILD_STATUS}" -ne "0" ]; then exit 1; fi if [ -z "${WHL_STATUS}" ] || [ "${WHL_STATUS}" -ne "0" ]; then exit 1; fi - name: complete testmo run diff --git a/.github/workflows/nm-lm-eval.yml 
b/.github/workflows/nm-lm-eval.yml index 90b7ec61a0a3a..4ffeb76ef9f15 100644 --- a/.github/workflows/nm-lm-eval.yml +++ b/.github/workflows/nm-lm-eval.yml @@ -91,11 +91,9 @@ jobs: Gi_per_thread: ${{ inputs.Gi_per_thread }} nvcc_threads: ${{ inputs.nvcc_threads }} - - name: hf cache - id: hf_cache - uses: ./.github/actions/nm-hf-cache/ - with: - fs_cache: ${{ secrets.HF_FS_CACHE }} + - name: caches + id: caches + uses: ./.github/actions/nm-caches/ - name: download whl id: download diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index 7de93e4b5ae40..89b2e6ea3074e 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -3,110 +3,45 @@ run-name: ${{ github.actor }} triggered nightly on ${{ github.ref }} on: schedule: # * is a special character in YAML so you have to quote this string - - cron: '0 1 * * 1-6' # nightly run (Mon-Sat) + - cron: '0 1 * * *' # nightly run workflow_dispatch: inputs: - push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI " + wf_category: + description: "workflow category, default is NIGHTLY" type: choice options: - - 'true' - - 'false' - default: 'false' - push_binaries_to_pypi: - description: "When set to yes, built wheels and tar.gz will be pushed to neuralmagic pypi if all tests pass" - type: string - default: 'yes' + - NIGHTLY + - RELEASE + default: NIGHTLY + push_to_pypi: + description: "when set and tests pass, then '.whl' and '.tar.gz' will be pushed to neuralmagic pypi" + type: boolean + default: false + push_benchmark_results_to_gh_pages: + description: "when set, then all benchmarking results are published to gh-pages UI " + type: boolean + default: false jobs: - PYTHON-3-8: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: NIGHTLY - python: 3.8.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-9: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: NIGHTLY - python: 3.9.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-10: + NIGHTLY: uses: ./.github/workflows/nm-build-test.yml with: - wf_category: NIGHTLY + wf_category: ${{ inputs.wf_category || 'NIGHTLY' }} python: 3.10.12 gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi 
}} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-11: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: NIGHTLY - python: 3.11.4 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} + push_to_pypi: ${{ inputs.push_to_pypi }} - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore + test_configs: '[{"python":"3.8.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, + {"python":"3.9.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, + {"python":"3.10.12","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, + {"python":"3.11.4","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}]' test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 + benchmark_timeout: 480 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" lm_eval_label: gcp-k8s-l4-solo diff --git a/.github/workflows/nm-release.yml b/.github/workflows/nm-release.yml deleted file mode 100644 index 76040df023c8e..0000000000000 --- a/.github/workflows/nm-release.yml +++ /dev/null @@ -1,110 +0,0 @@ -name: nm release -run-name: ${{ github.actor }} verifying branch '${{ github.ref }}' -on: - workflow_dispatch: - inputs: - push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: choice - options: - - 'true' - - 'false' - default: 'false' - push_binaries_to_pypi: - description: "When set to yes, built wheels and tar.gz will be pushed to neuralmagic pypi if all tests pass" - type: string - default: 'no' - -jobs: - - PYTHON-3-8: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.8.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-9: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.9.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - 
benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-10: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.10.12 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-11: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.11.4 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml index 4a9171fcdcc0e..b23a2f9389512 100644 --- a/.github/workflows/nm-remote-push.yml +++ b/.github/workflows/nm-remote-push.yml @@ -12,80 +12,18 @@ concurrency: jobs: - BUILD-TEST-3-8: - uses: ./.github/workflows/nm-build-test.yml - with: - python: 3.8.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: 'no' - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 480 - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - BUILD-TEST-3-9: - uses: ./.github/workflows/nm-build-test.yml - with: - python: 3.9.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: 'no' - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 480 - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - BUILD-TEST-3-10: + REMOTE: uses: ./.github/workflows/nm-build-test.yml with: python: 3.10.12 gitref: ${{ github.ref }} - push_binaries_to_pypi: 'no' - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: 
neuralmagic/tests/test_skip_env_vars/smoke.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 480 - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - BUILD-TEST-3-11: - uses: ./.github/workflows/nm-build-test.yml - with: - python: 3.11.4 - gitref: ${{ github.ref }} - push_binaries_to_pypi: 'no' + push_to_pypi: false - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore + test_configs: '[{"python":"3.8.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, + {"python":"3.9.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, + {"python":"3.10.12","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, + {"python":"3.11.4","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}]' test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt diff --git a/.github/workflows/nm-test.yml b/.github/workflows/nm-test.yml index a8b45ccbe61fe..01d6fa96730d3 100644 --- a/.github/workflows/nm-test.yml +++ b/.github/workflows/nm-test.yml @@ -4,7 +4,7 @@ on: workflow_call: inputs: test_label: - description: "requested runner label (specifies instance)" + description: "requested runner label" type: string required: true timeout: @@ -32,7 +32,7 @@ on: workflow_dispatch: inputs: test_label: - description: "requested runner label (specifies instance)" + description: "requested runner label" type: string required: true timeout: @@ -109,11 +109,9 @@ jobs: id: verify_python uses: ./.github/actions/nm-verify-python/ - - name: hf cache - id: hf_cache - uses: ./.github/actions/nm-hf-cache/ - with: - fs_cache: ${{ secrets.HF_FS_CACHE }} + - name: caches + id: caches + uses: ./.github/actions/nm-caches/ - name: download whl id: download diff --git a/.github/workflows/nm-upload-assets-to-gcp.yml b/.github/workflows/nm-upload-assets-to-gcp.yml index bfade2a90d2f9..5db9be07b8ff4 100644 --- a/.github/workflows/nm-upload-assets-to-gcp.yml +++ b/.github/workflows/nm-upload-assets-to-gcp.yml @@ -33,6 +33,12 @@ jobs: steps: + - name: install automation components + run: | + sudo apt-get update --fix-missing + sudo apt-get install -y git-all + sudo apt-get install -y curl + - name: checkout id: checkout uses: actions/checkout@v4 diff --git a/.github/workflows/nm-weekly.yml b/.github/workflows/nm-weekly.yml deleted file mode 100644 index 4f3ce2c4f9f79..0000000000000 --- a/.github/workflows/nm-weekly.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: nm Weekly -run-name: ${{ github.actor }} triggered weekly on ${{ github.ref }} -on: - schedule: - # * is a special character in YAML so you have to quote this string - - cron: '0 1 * * 0' # weekly run (Sun) - - workflow_dispatch: - inputs: - push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: choice - options: - - 'true' - - 'false' - default: 'false' - -jobs: - - BUILD-TEST: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: WEEKLY - python: 3.10.12 - gitref: ${{ github.ref }} - push_binaries_to_pypi: 'no' - - test_label_solo: aws-avx2-32G-a10g-24G - test_label_multi: aws-avx2-192G-4-a10g-96G - test_timeout: 480 - 
test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: aws-avx2-32G-a10g-24G - benchmark_config_list_file: ./.github/data/nm_benchmark_weekly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-samll-models.yaml - lm_eval_timeout: 60 - secrets: inherit diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 8a68dceac136c..c6453f6a93a6a 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -1,6 +1,7 @@ from typing import Any, List, Optional import pytest +import torch from tests.nm_utils.utils_skip import should_skip_test_group from vllm import CompletionOutput, LLMEngine, SamplingParams @@ -19,6 +20,8 @@ def vllm_model(vllm_runner): yield vllm_model +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 KV cache constraints") @pytest.mark.skip_global_cleanup def test_stop_basic(vllm_model): _test_stopping(vllm_model.model.llm_engine, @@ -34,6 +37,8 @@ def test_stop_basic(vllm_model): expected_reason=".") +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 KV cache constraints") @pytest.mark.skip_global_cleanup def test_stop_multi_tokens(vllm_model): _test_stopping( @@ -52,6 +57,8 @@ def test_stop_multi_tokens(vllm_model): expected_reason="group of peo") +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 KV cache constraints") @pytest.mark.skip_global_cleanup def test_stop_partial_token(vllm_model): _test_stopping(vllm_model.model.llm_engine, @@ -67,6 +74,8 @@ def test_stop_partial_token(vllm_model): expected_reason="gani") +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 KV cache constraints") @pytest.mark.skip_global_cleanup def test_stop_token_id(vllm_model): # token id 13013 => " organization" diff --git a/tests/entrypoints/test_llm_generate_multiple_loras.py b/tests/entrypoints/test_llm_generate_multiple_loras.py index 8401e9836f1ac..110f9c2aafafd 100644 --- a/tests/entrypoints/test_llm_generate_multiple_loras.py +++ b/tests/entrypoints/test_llm_generate_multiple_loras.py @@ -1,6 +1,7 @@ import weakref import pytest +import torch # downloading lora to test lora requests from huggingface_hub import snapshot_download @@ -54,6 +55,11 @@ def zephyr_lora_files(): return snapshot_download(repo_id=LORA_NAME) +@pytest.mark.skipif( + torch.cuda.get_device_capability() < (8, 0), + reason= + "Bfloat16 is only supported on GPUs with compute capability of at least 8.0" +) @pytest.mark.skip_global_cleanup def test_multiple_lora_requests(llm: LLM, zephyr_lora_files): lora_request = [ diff --git a/tests/models_core/test_llm_logprobs.py b/tests/models_core/test_llm_logprobs.py index be776637c87f6..65186aafabf48 100644 --- a/tests/models_core/test_llm_logprobs.py +++ b/tests/models_core/test_llm_logprobs.py @@ -6,6 +6,7 @@ Run `pytest tests/models/test_models_logprobs.py`. 
""" import pytest +import torch from tests.models.utils import check_logprobs_close from tests.nm_utils.utils_skip import should_skip_test_group @@ -24,6 +25,8 @@ ] +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 memory constraints") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/models_core/test_magic_wand.py b/tests/models_core/test_magic_wand.py index f4f4831f4f1da..f31fb1885506b 100644 --- a/tests/models_core/test_magic_wand.py +++ b/tests/models_core/test_magic_wand.py @@ -1,6 +1,6 @@ """Compare the outputs of a sparse model vs sparse model running dense. -Note: sparse kernels do not have bitwise correctness vs the dense models. -As a result, in this test, we just confirm that the top selected tokens of the +Note: sparse kernels do not have bitwise correctness vs the dense models. +As a result, in this test, we just confirm that the top selected tokens of the sparse models are in the top N selections of same model running dense. Run `pytest tests/models_core/test_magic_wand.py`. @@ -9,6 +9,7 @@ import gc import pytest +import torch from tests.models.utils import check_logprobs_close from tests.nm_utils.utils_skip import should_skip_test_group @@ -25,6 +26,8 @@ ] +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="skip for T4s, requires compute capability 8.0") @pytest.mark.parametrize("model_format_extrablocks", MODEL_FORMAT_EXTRABLOCKS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/models_core/test_server_logprobs.py b/tests/models_core/test_server_logprobs.py index 48a4d7bece18b..023d63e3b93ae 100644 --- a/tests/models_core/test_server_logprobs.py +++ b/tests/models_core/test_server_logprobs.py @@ -67,6 +67,8 @@ async def my_chat( top_logprobs=num_logprobs) +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 memory constraints") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index 1e2dc9197b403..4c28f859e793e 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -3,6 +3,7 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. 
""" import pytest +import torch from tests.conftest import cleanup from tests.nm_utils.utils_skip import should_skip_test_group @@ -26,6 +27,11 @@ ] +@pytest.mark.skipif( + torch.cuda.get_device_capability() < (8, 0), + reason= + "Bfloat16 is only supported on GPUs with compute capability of at least 8.0" +) @pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) def test_disable_sliding_window(model_len_len, ): model, sliding_len, full_len = model_len_len diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 3b7dcbc5983fc..db35679b03acb 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -6,6 +6,7 @@ from dataclasses import dataclass import pytest +import torch from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import ModelConfig @@ -55,6 +56,8 @@ class ModelPair: ] +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="skip for T4s, requires compute capability 8.0") @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) def test_auto_gptq(model_arg_exptype: str) -> None: model_path, quantization_arg, expected_type = model_arg_exptype diff --git a/tests/test_regression.py b/tests/test_regression.py index 5d27d35793017..46298d6dc16fa 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -6,6 +6,7 @@ """ import gc +import pytest import torch from vllm import LLM, SamplingParams @@ -53,6 +54,11 @@ def test_gc(): assert allocated < 50 * 1024 * 1024 +@pytest.mark.skipif( + torch.cuda.get_device_capability() < (8, 0), + reason= + "Bfloat16 is only supported on GPUs with compute capability of at least 8.0" +) def test_model_from_modelscope(monkeypatch): # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat"