From 250f6d88e6fbb6b1179f681bef2bbdb7095d3a55 Mon Sep 17 00:00:00 2001 From: dhuangnm <74931910+dhuangnm@users.noreply.github.com> Date: Mon, 24 Jun 2024 15:02:30 -0400 Subject: [PATCH 1/2] bump version to 0.5.1 (#330) Co-authored-by: dhuangnm --- vllm/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/version.py b/vllm/version.py index db951e35dac97..368411c8059c8 100644 --- a/vllm/version.py +++ b/vllm/version.py @@ -1,2 +1,2 @@ # UPSTREAM SYNC: take downstream -__version__ = "0.5.0" +__version__ = "0.5.1" From 05c3004489c0c0501dba5fab72b254e3014f1ae5 Mon Sep 17 00:00:00 2001 From: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com> Date: Mon, 24 Jun 2024 21:10:12 -0400 Subject: [PATCH 2/2] cross python whl (#315) SUMMARY: * transition workflows to one cross python whl. minimum supported python is 3.8. * adjustments to actions resulting from using one whl * adjustments to workflows resulting from using one whl * updating parameters to enable passing in an "array" of test labels, python versions, and tests to run. * upload whl to GCP storage bucket organized by run id. this will be used in a later PR to allow us to install and run from GCP directly. * deletion of WEEKLY and RELEASE workflows. these can now be handled via the NIGHTLY by just specifying different input parameters. * removal of "WEEKLY" workflow category. we should just push this semantic into "RELEASE". * add "pytest skip if" for some tests to avoid running them on T4s (<8.0 compute capability) * "remote push" will now run on "l4 solo" and "l4 duo". there are still some issues with the T4s, so we are moving all jobs to run on L4s. we should address the T4 issues in a separate PR. * adding "python cache" semantics. this roughly cuts build times in half. building the whl now takes about 25 minutes. this is in line with the times when we were using GCP static runners. * clean up "publish binaries" parameter. moved it to a boolean and set the default to `false`. this makes everything cleaner and also cleans up the GHA UI when manually triggering workflows.
* remove reference to skip list in "run tests" TEST PLAN: runs on remote push --------- Co-authored-by: andy-neuma --- .github/actions/nm-build-vllm/action.yml | 37 +----- .github/actions/nm-caches/action.yml | 11 ++ .github/actions/nm-hf-cache/action.yml | 13 --- .github/actions/nm-install-whl/action.yml | 3 +- .github/actions/nm-set-env/action.yml | 2 + .github/actions/nm-summary-build/action.yml | 4 - .github/actions/nm-test-whl/action.yml | 4 +- .github/scripts/run-tests | 34 ------ .github/workflows/nm-benchmark.yml | 17 +-- .github/workflows/nm-build-test.yml | 95 ++++++++------- .github/workflows/nm-build.yml | 33 +++++- .github/workflows/nm-lm-eval.yml | 8 +- .github/workflows/nm-nightly.yml | 109 ++++------------- .github/workflows/nm-release.yml | 110 ------------------ .github/workflows/nm-remote-push.yml | 74 +----------- .github/workflows/nm-test.yml | 12 +- .github/workflows/nm-upload-assets-to-gcp.yml | 6 + .github/workflows/nm-weekly.yml | 41 ------- tests/engine/test_stop_strings.py | 9 ++ .../test_llm_generate_multiple_loras.py | 6 + tests/models_core/test_llm_logprobs.py | 3 + tests/models_core/test_magic_wand.py | 7 +- tests/models_core/test_server_logprobs.py | 2 + .../test_disable_sliding_window.py | 6 + tests/quantization/test_configs.py | 3 + tests/test_regression.py | 6 + 26 files changed, 191 insertions(+), 464 deletions(-) create mode 100644 .github/actions/nm-caches/action.yml delete mode 100644 .github/actions/nm-hf-cache/action.yml delete mode 100644 .github/workflows/nm-release.yml delete mode 100644 .github/workflows/nm-weekly.yml diff --git a/.github/actions/nm-build-vllm/action.yml b/.github/actions/nm-build-vllm/action.yml index c78a9a8b27d65..fc629242248fb 100644 --- a/.github/actions/nm-build-vllm/action.yml +++ b/.github/actions/nm-build-vllm/action.yml @@ -7,15 +7,9 @@ inputs: venv: description: 'name for python virtual environment' required: true - pypi: - description: 'ip address for pypi server' - required: true outputs: - build_status: - description: "final status from 'pip install -e'" - value: ${{ steps.build.outputs.build_status }} whl_status: - description: "final status from 'pip3 wheel --no-deps -w dist'" + description: "final status from constructing the whl" value: ${{ steps.build.outputs.whl_status }} whl: description: 'basename for generated whl' @@ -31,35 +25,16 @@ runs: COMMIT=${{ github.sha }} VENV="${{ inputs.venv }}-${COMMIT:0:7}" source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate - # TODO: adjust when we need a proper release. use nightly now. pip3 install -r requirements-cuda.txt -r requirements-build.txt - # build - SUCCESS=0 - pip3 install -e . || SUCCESS=$? - echo "build_status=${SUCCESS}" >> "$GITHUB_OUTPUT" - if [ ${SUCCESS} -ne 0 ]; then - exit 1 - fi - # strip binaries - if [ ! $(command -v strip) ]; then - sudo apt install -y binutils - fi - if [ ! $(command -v file) ]; then - sudo apt install -y file - fi - for eachso in $(find . -type f -name '*.so') - do - strip $eachso - file $eachso - done # whl SUCCESS=0 - pip3 wheel --no-deps -w dist . || SUCCESS=$? + python setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 || SUCCESS=$? 
echo "whl_status=${SUCCESS}" >> "$GITHUB_OUTPUT" - BASE=$(./.github/scripts/convert-version ${{ inputs.python }}) ls -alh dist - WHL_FILEPATH=$(find dist -iname "*${BASE}*.whl") + WHL_FILEPATH=$(find dist -type f -iname "*linux_x86_64.whl") + echo "whl: ${WHL_FILEPATH}" RENAME=$(echo ${WHL_FILEPATH} | sed -e 's/linux_x86_64/manylinux_2_17_x86_64/') + echo "rename: ${RENAME}" mv ${WHL_FILEPATH} ${RENAME} WHL=$(basename ${RENAME}) echo "whl=${WHL}" >> "$GITHUB_OUTPUT" @@ -67,7 +42,7 @@ runs: exit 1 fi # sdist - python3 setup.py sdist || SUCCESS=$? + python setup.py sdist || SUCCESS=$? pyenv uninstall --force ${{ inputs.python}}/envs/${VENV} ls -alh dist TAR_FILEPATH=$(find dist -type f -iname "*.tar.gz") diff --git a/.github/actions/nm-caches/action.yml b/.github/actions/nm-caches/action.yml new file mode 100644 index 0000000000000..db4db069a7d7a --- /dev/null +++ b/.github/actions/nm-caches/action.yml @@ -0,0 +1,11 @@ +name: set up caches +description: 'set up HF and Python caches' +runs: + using: composite + steps: + - run: | + sudo mkdir -m 777 -p ${HF_HOME} + sudo chown -R $(whoami):$(whoami) ${HF_HOME} + sudo mkdir -m 777 -p ${PIP_CACHE_DIR} + sudo chown -R $(whoami):$(whoami) ${PIP_CACHE_DIR} + shell: bash diff --git a/.github/actions/nm-hf-cache/action.yml b/.github/actions/nm-hf-cache/action.yml deleted file mode 100644 index 62f54703c9e65..0000000000000 --- a/.github/actions/nm-hf-cache/action.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: HF cache -description: 'mount HF cache' -inputs: - fs_cache: - description: '(deprecated) filesystem to use for HF cache' - required: true -runs: - using: composite - steps: - - run: | - sudo mkdir -m 777 -p ${HF_HOME} - sudo chown -R $(whoami):$(whoami) ${HF_HOME} - shell: bash diff --git a/.github/actions/nm-install-whl/action.yml b/.github/actions/nm-install-whl/action.yml index e229d0bc95d2b..95d6722bb658f 100644 --- a/.github/actions/nm-install-whl/action.yml +++ b/.github/actions/nm-install-whl/action.yml @@ -22,8 +22,7 @@ runs: source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate fi pip3 install -r requirements-dev.txt - BASE=$(./.github/scripts/convert-version ${{ inputs.python }}) - WHL=$(find . -type f -iname "*${BASE}*.whl") + WHL=$(find . 
-type f -iname "nm_vllm*.whl") WHL_BASENAME=$(basename ${WHL}) echo "whl=${WHL_BASENAME}" >> "$GITHUB_OUTPUT" pip3 install ${WHL}[sparse] --extra-index-url https://pypi.neuralmagic.com/simple diff --git a/.github/actions/nm-set-env/action.yml b/.github/actions/nm-set-env/action.yml index dd8ef9e99d61c..b262bda10d44e 100644 --- a/.github/actions/nm-set-env/action.yml +++ b/.github/actions/nm-set-env/action.yml @@ -28,6 +28,8 @@ runs: # HF Cache echo "HF_TOKEN=${HF_TOKEN_SECRET}" >> $GITHUB_ENV echo "HF_HOME=/model-cache" >> $GITHUB_ENV + # Python cache + echo "PIP_CACHE_DIR=/model-cache/python-cache" >> $GITHUB_ENV # build NUM_THREADS=$(./.github/scripts/determine-threading -G ${{ inputs.Gi_per_thread }}) echo "MAX_JOBS=${NUM_THREADS}" >> $GITHUB_ENV diff --git a/.github/actions/nm-summary-build/action.yml b/.github/actions/nm-summary-build/action.yml index a890f5ad016f2..6f0eb3b457a82 100644 --- a/.github/actions/nm-summary-build/action.yml +++ b/.github/actions/nm-summary-build/action.yml @@ -13,9 +13,6 @@ inputs: python: description: 'python version info' required: true - build_status: - description: 'status from build step' - required: true whl_status: description: 'status from build step' required: true @@ -37,6 +34,5 @@ runs: echo "| gitref: | '${{ inputs.gitref }}' |" >> $GITHUB_STEP_SUMMARY echo "| branch name: | '${{ github.ref_name }}' |" >> $GITHUB_STEP_SUMMARY echo "| python: | ${{ inputs.python }} |" >> $GITHUB_STEP_SUMMARY - echo "| build: | ${BUILD_EMOJI} |" >> $GITHUB_STEP_SUMMARY echo "| whl: | ${WHL_EMOJI} |" >> $GITHUB_STEP_SUMMARY shell: bash diff --git a/.github/actions/nm-test-whl/action.yml b/.github/actions/nm-test-whl/action.yml index 53ca57598f8f8..557374fa11b08 100644 --- a/.github/actions/nm-test-whl/action.yml +++ b/.github/actions/nm-test-whl/action.yml @@ -19,13 +19,15 @@ runs: steps: - id: test_whl run: | + sudo mkdir -m 777 -p /usr/local/apps + sudo chown -R $(whoami):$(whoami) /usr/local/apps pip install coverage pip install pytest-cov pip install pytest-xdist pip install -r requirements-dev.txt SUCCESS=0 VLLM_SRC=$(python3 -c "import vllm; print(vllm.__path__[0])") - ./.github/scripts/run-tests -s ${VLLM_SRC} -t ${{ inputs.test_directory }} -r ${{ inputs.test_results }} -f ${{ inputs.test_skip_list }}|| SUCCESS=$? + ./.github/scripts/run-tests -s ${VLLM_SRC} -t ${{ inputs.test_directory }} -r ${{ inputs.test_results }} || SUCCESS=$? pytest ./neuralmagic/tests/test_nm-vllm_licenses.py --junitxml=${{ inputs.test_results }}/test_nm-vllm_licenses.xml echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT" exit ${SUCCESS} diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests index bea7fffd7a93a..e2e12772d0816 100755 --- a/.github/scripts/run-tests +++ b/.github/scripts/run-tests @@ -8,7 +8,6 @@ usage() { echo " -s - src directory, i.e. location of package *.py files." echo " -t - test directory, i.e. location of *.py test files. (default 'tests/')" echo " -r - desired results base directory. xml results will mirror provided tests directory structure. (default 'test-results/')" - echo " -f - file with test skip list, e.g. ' neuralmagic/tests/skip-for-remote-push.txt'. 
(default is to run all found tests)" echo " -h - this list of options" echo echo "note: all paths are relative to 'nm-vllm' root" @@ -35,9 +34,6 @@ while getopts "hs:t:r:f:" OPT; do r) RESULTS_DIR="${OPTARG}" ;; - f) - SKIP_LIST="${OPTARG}" - ;; esac done @@ -71,36 +67,6 @@ for FOUND in "${TESTS_FOUND[@]}"; do echo "${FOUND}" done -# build the skip list from provided file -declare -a TESTS_TO_EXCLUDE -if [ -f "${SKIP_LIST}" ]; then - while IFS= read -r line - do - TESTS_TO_EXCLUDE+=("${line}") - done < "${SKIP_LIST}" -fi - -echo "..." -for EXCLUDE in "${TESTS_TO_EXCLUDE[@]}"; do - for JJ in "${!TESTS_FOUND[@]}"; do - if [[ ${TESTS_FOUND[$JJ]} = ${EXCLUDE} ]]; then - echo "excluding: ${EXCLUDE}" - unset 'TESTS_FOUND[$JJ]' - fi - done -done - -echo "..." -echo "planning to run:" -for TEST in "${TESTS_FOUND[@]}" -do - echo "${TEST}" -done -echo "..." - -# download required artifacts for testing -# (cd ${TEST_DIR} && sudo bash ../.buildkite/download-images.sh) - # run selected tests SUCCESS=0 CC_PYTEST_FLAGS="--cov=${SRC_DIR} --cov=${TEST_DIR} --cov-report=html:cc-vllm-html --cov-append" diff --git a/.github/workflows/nm-benchmark.yml b/.github/workflows/nm-benchmark.yml index bac8133fd5c97..9ab70c84a357c 100644 --- a/.github/workflows/nm-benchmark.yml +++ b/.github/workflows/nm-benchmark.yml @@ -29,7 +29,7 @@ on: required: true push_benchmark_results_to_gh_pages: description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: string + type: boolean required: true # makes workflow manually callable @@ -61,11 +61,8 @@ on: required: true push_benchmark_results_to_gh_pages: description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: choice - options: - - 'true' - - 'false' - default: 'false' + type: boolean + default: false env: BENCHMARK_RESULTS: /model-cache/benchmark_results @@ -109,11 +106,9 @@ jobs: Gi_per_thread: 1 nvcc_threads: 0 - - name: hf cache - id: hf_cache - uses: ./.github/actions/nm-hf-cache/ - with: - fs_cache: ${{ secrets.HF_FS_CACHE }} + - name: caches + id: caches + uses: ./.github/actions/nm-caches/ - name: download whl id: download diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml index 1a037a837ed58..e85dd2b199745 100644 --- a/.github/workflows/nm-build-test.yml +++ b/.github/workflows/nm-build-test.yml @@ -1,27 +1,29 @@ name: nm build-test on: + # makes workflow reusable workflow_call: inputs: wf_category: - description: "categories: REMOTE, NIGHTLY, WEEKLY, RELEASE" + description: "workflow category: REMOTE, NIGHTLY, RELEASE" type: string default: "REMOTE" - push_binaries_to_pypi: + push_to_pypi: description: "When set to true, built wheels and tar.gz will be pushed to neuralmagic pypi if all tests pass" - type: string - default: 'no' + type: boolean + default: false python: description: "python version, e.g. 
3.10.12" type: string required: true + # build related parameters build_label: description: "requested runner label (specifies instance)" type: string default: gcp-k8s-build build_timeout: - description: "time limit for build in minutes " + description: "time limit for build in minutes" type: string default: "120" Gi_per_thread: @@ -32,27 +34,29 @@ on: description: "number of threads nvcc build threads" type: string default: "8" + # test related parameters - test_label_solo: - description: "requested runner label (specifies instance)" - type: string - required: true - test_label_multi: - description: "requested runner label (specifies instance)" + + # stringified Json array of maps + # each map has a "python", "gha label", "test skip env vars" e.g. + # [ + # {'python':'3.8.17','label':'gcp-k8s-l4-solo','test':'neuralmagic/tests/test_skip_env_vars/smoke.txt'}, + # ... + # ] + test_configs: + description: "python, label, skip envs" type: string required: true + test_timeout: - description: "time limit for test run in minutes " + description: "time limit for test run in minutes" type: string required: true gitref: description: "git commit hash or branch name" type: string required: true - test_skip_env_vars: - description: 'file with list of env vars controlling which tests to run' - type: string - required: true + # benchmark related parameters benchmark_label: description: "requested benchmark label (specifies instance)" @@ -67,9 +71,10 @@ on: type: string default: "720" push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: string - default: "false" + description: "when set to true, the workflow pushes all benchmarking results to gh-pages UI" + type: boolean + default: false + # lm-eval related parameters lm_eval_label: description: "requested runner label (specifies instance)" @@ -82,11 +87,24 @@ on: lm_eval_configuration: description: "configuration for lm-eval test (see neuralmagic/lm-eval)" type: string - default: "" + default: "" jobs: + JSON-VALIDATE: + runs-on: gcp-k8s-util + strategy: + matrix: + test_config: ${{ fromJson(inputs.test_configs) }} + steps: + - name: validate test config + run: | + echo "python: ${{ matrix.test_config.python }}" + echo "label: ${{ matrix.test_config.label }}" + echo "tests: ${{ matrix.test_config.test }}" + BUILD: + needs: [JSON-VALIDATE] uses: ./.github/workflows/nm-build.yml with: wf_category: ${{ inputs.wf_category }} @@ -98,33 +116,23 @@ jobs: python: ${{ inputs.python }} secrets: inherit - TEST-SOLO: + TEST: needs: [BUILD] if: success() + strategy: + fail-fast: false + matrix: + test_config: ${{ fromJson(inputs.test_configs) }} uses: ./.github/workflows/nm-test.yml with: - test_label: ${{ inputs.test_label_solo }} + test_label: ${{ matrix.test_config.label }} timeout: ${{ inputs.test_timeout }} gitref: ${{ github.ref }} - python: ${{ inputs.python }} - whl: ${{ needs.BUILD.outputs.whl }} - test_skip_env_vars: ${{ inputs.test_skip_env_vars }} + python: ${{ matrix.test_config.python }} + whl: ${{ needs.BUILD.output.whl }} + test_skip_env_vars: ${{ matrix.test_config.test }} secrets: inherit - # TODO: re-enable - # TEST-MULTI: - # needs: [BUILD] - # if: success() && contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category) - # uses: ./.github/workflows/nm-test.yml - # with: - # test_label: ${{ inputs.test_label_multi }} - # timeout: ${{ inputs.test_timeout }} - # gitref: ${{ github.ref }} - # python: ${{ inputs.python }} - # whl: ${{ 
needs.BUILD.outputs.whl }} - # test_skip_env_vars: ${{ inputs.test_skip_env_vars }} - # secrets: inherit - BENCHMARK: needs: [BUILD] if: success() @@ -140,7 +148,7 @@ jobs: push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" secrets: inherit - LM-EVAL-SOLO: + LM-EVAL: needs: [BUILD] uses: ./.github/workflows/nm-lm-eval.yml with: @@ -152,12 +160,13 @@ jobs: lm_eval_configuration: ${{ inputs.lm_eval_configuration }} secrets: inherit + # uploading is only available when using GCP autoscaling group UPLOAD: - needs: [TEST-SOLO, BENCHMARK, LM-EVAL-SOLO] - if: ${{ contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category) && inputs.push_binaries_to_pypi == 'yes' }} + needs: [TEST, BENCHMARK, LM-EVAL] + if: ${{ inputs.push_to_pypi }} uses: ./.github/workflows/nm-upload-assets-to-gcp.yml with: - label: ${{ inputs.build_label }} + label: gcp-k8s-util timeout: ${{ inputs.build_timeout }} gitref: ${{ github.ref }} python: ${{ inputs.python }} diff --git a/.github/workflows/nm-build.yml b/.github/workflows/nm-build.yml index 5ac43dd8db1bc..d8672a5118971 100644 --- a/.github/workflows/nm-build.yml +++ b/.github/workflows/nm-build.yml @@ -73,7 +73,13 @@ jobs: runs-on: ${{ inputs.build_label }} timeout-minutes: ${{ fromJson(inputs.timeout) }} + + permissions: + contents: 'read' + id-token: 'write' + outputs: + run_id: ${{ github.run_id }} whl: ${{ steps.build.outputs.whl }} tarfile: ${{ steps.build.outputs.tarfile }} @@ -118,7 +124,27 @@ jobs: with: python: ${{ inputs.python }} venv: ${{ env.VENV_BASE }} - pypi: ${{ secrets.NM_PRIVATE_PYPI_LOCATION }} + + # GCP + - name: 'Authenticate to Google Cloud' + id: auth + uses: google-github-actions/auth@v2.1.3 + with: + project_id: ${{ secrets.GCP_PROJECT }} + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.NM_PYPI_SA }} + + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v2' + with: + version: '>= 473.0.0' + + - name: copy whl and source distribution + run: | + # echo "whl: ${{ steps.build.outputs.whl }}" + # echo "tarfile: ${{ steps.build.outputs.tarfile }}" + gcloud storage cp dist/${{ steps.build.outputs.whl }} gs://neuralmagic-public-pypi/assets/${{ github.run_id }}/${{ steps.build.outputs.whl }} + gcloud storage cp dist/${{ steps.build.outputs.tarfile }} gs://neuralmagic-public-pypi/assets/${{ github.run_id }}/${{ steps.build.outputs.tarfile }} - name: upload whl uses: actions/upload-artifact@v4 @@ -126,7 +152,7 @@ jobs: with: name: ${{ steps.build.outputs.whl }} path: dist/${{ steps.build.outputs.whl }} - retention-days: 15 + retention-days: 5 - name: upload tar.gz uses: actions/upload-artifact@v4 @@ -144,7 +170,6 @@ jobs: gitref: ${{ inputs.gitref }} testmo_run_url: https://neuralmagic.testmo.net/automation/runs/view/${{ steps.create_testmo_run.outputs.id }} python: ${{ steps.set_python.outputs.version }} - build_status: ${{ steps.build.outputs.build_status }} whl_status: ${{ steps.build.outputs.whl_status }} - name: run status @@ -154,9 +179,7 @@ jobs: BUILD_STATUS: ${{ steps.build.outputs.build_status }} WHL_STATUS: ${{ steps.build.outputs.whl_status }} run: | - echo "build status: ${BUILD_STATUS}" echo "build status: ${WHL_STATUS}" - if [ -z "${BUILD_STATUS}" ] || [ "${BUILD_STATUS}" -ne "0" ]; then exit 1; fi if [ -z "${WHL_STATUS}" ] || [ "${WHL_STATUS}" -ne "0" ]; then exit 1; fi - name: complete testmo run diff --git a/.github/workflows/nm-lm-eval.yml 
b/.github/workflows/nm-lm-eval.yml index 90b7ec61a0a3a..4ffeb76ef9f15 100644 --- a/.github/workflows/nm-lm-eval.yml +++ b/.github/workflows/nm-lm-eval.yml @@ -91,11 +91,9 @@ jobs: Gi_per_thread: ${{ inputs.Gi_per_thread }} nvcc_threads: ${{ inputs.nvcc_threads }} - - name: hf cache - id: hf_cache - uses: ./.github/actions/nm-hf-cache/ - with: - fs_cache: ${{ secrets.HF_FS_CACHE }} + - name: caches + id: caches + uses: ./.github/actions/nm-caches/ - name: download whl id: download diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index 7de93e4b5ae40..89b2e6ea3074e 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -3,110 +3,45 @@ run-name: ${{ github.actor }} triggered nightly on ${{ github.ref }} on: schedule: # * is a special character in YAML so you have to quote this string - - cron: '0 1 * * 1-6' # nightly run (Mon-Sat) + - cron: '0 1 * * *' # nightly run workflow_dispatch: inputs: - push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI " + wf_category: + description: "workflow category, default is NIGHTLY" type: choice options: - - 'true' - - 'false' - default: 'false' - push_binaries_to_pypi: - description: "When set to yes, built wheels and tar.gz will be pushed to neuralmagic pypi if all tests pass" - type: string - default: 'yes' + - NIGHTLY + - RELEASE + default: NIGHTLY + push_to_pypi: + description: "when set and tests pass, then '.whl' and '.tar.gz' will be pushed to neuralmagic pypi" + type: boolean + default: false + push_benchmark_results_to_gh_pages: + description: "when set, then all benchmarking results are published to gh-pages UI " + type: boolean + default: false jobs: - PYTHON-3-8: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: NIGHTLY - python: 3.8.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-9: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: NIGHTLY - python: 3.9.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-10: + NIGHTLY: uses: ./.github/workflows/nm-build-test.yml with: - wf_category: NIGHTLY + wf_category: ${{ inputs.wf_category || 'NIGHTLY' }} python: 3.10.12 gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi 
}} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-11: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: NIGHTLY - python: 3.11.4 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} + push_to_pypi: ${{ inputs.push_to_pypi }} - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore + test_configs: '[{"python":"3.8.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, + {"python":"3.9.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, + {"python":"3.10.12","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, + {"python":"3.11.4","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}]' test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 + benchmark_timeout: 480 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" lm_eval_label: gcp-k8s-l4-solo diff --git a/.github/workflows/nm-release.yml b/.github/workflows/nm-release.yml deleted file mode 100644 index 76040df023c8e..0000000000000 --- a/.github/workflows/nm-release.yml +++ /dev/null @@ -1,110 +0,0 @@ -name: nm release -run-name: ${{ github.actor }} verifying branch '${{ github.ref }}' -on: - workflow_dispatch: - inputs: - push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: choice - options: - - 'true' - - 'false' - default: 'false' - push_binaries_to_pypi: - description: "When set to yes, built wheels and tar.gz will be pushed to neuralmagic pypi if all tests pass" - type: string - default: 'no' - -jobs: - - PYTHON-3-8: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.8.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-9: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.9.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - 
benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-10: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.10.12 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - PYTHON-3-11: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.11.4 - gitref: ${{ github.ref }} - push_binaries_to_pypi: ${{ inputs.push_binaries_to_pypi }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml index 4a9171fcdcc0e..b23a2f9389512 100644 --- a/.github/workflows/nm-remote-push.yml +++ b/.github/workflows/nm-remote-push.yml @@ -12,80 +12,18 @@ concurrency: jobs: - BUILD-TEST-3-8: - uses: ./.github/workflows/nm-build-test.yml - with: - python: 3.8.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: 'no' - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 480 - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - BUILD-TEST-3-9: - uses: ./.github/workflows/nm-build-test.yml - with: - python: 3.9.17 - gitref: ${{ github.ref }} - push_binaries_to_pypi: 'no' - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 480 - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - BUILD-TEST-3-10: + REMOTE: uses: ./.github/workflows/nm-build-test.yml with: python: 3.10.12 gitref: ${{ github.ref }} - push_binaries_to_pypi: 'no' - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: 
neuralmagic/tests/test_skip_env_vars/smoke.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 480 - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml - lm_eval_timeout: 60 - secrets: inherit - - BUILD-TEST-3-11: - uses: ./.github/workflows/nm-build-test.yml - with: - python: 3.11.4 - gitref: ${{ github.ref }} - push_binaries_to_pypi: 'no' + push_to_pypi: false - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore + test_configs: '[{"python":"3.8.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, + {"python":"3.9.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, + {"python":"3.10.12","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, + {"python":"3.11.4","label":"gcp-k8s-l4-duo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}]' test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt benchmark_label: gcp-k8s-l4-solo benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt diff --git a/.github/workflows/nm-test.yml b/.github/workflows/nm-test.yml index a8b45ccbe61fe..01d6fa96730d3 100644 --- a/.github/workflows/nm-test.yml +++ b/.github/workflows/nm-test.yml @@ -4,7 +4,7 @@ on: workflow_call: inputs: test_label: - description: "requested runner label (specifies instance)" + description: "requested runner label" type: string required: true timeout: @@ -32,7 +32,7 @@ on: workflow_dispatch: inputs: test_label: - description: "requested runner label (specifies instance)" + description: "requested runner label" type: string required: true timeout: @@ -109,11 +109,9 @@ jobs: id: verify_python uses: ./.github/actions/nm-verify-python/ - - name: hf cache - id: hf_cache - uses: ./.github/actions/nm-hf-cache/ - with: - fs_cache: ${{ secrets.HF_FS_CACHE }} + - name: caches + id: caches + uses: ./.github/actions/nm-caches/ - name: download whl id: download diff --git a/.github/workflows/nm-upload-assets-to-gcp.yml b/.github/workflows/nm-upload-assets-to-gcp.yml index bfade2a90d2f9..5db9be07b8ff4 100644 --- a/.github/workflows/nm-upload-assets-to-gcp.yml +++ b/.github/workflows/nm-upload-assets-to-gcp.yml @@ -33,6 +33,12 @@ jobs: steps: + - name: install automation components + run: | + sudo apt-get update --fix-missing + sudo apt-get install -y git-all + sudo apt-get install -y curl + - name: checkout id: checkout uses: actions/checkout@v4 diff --git a/.github/workflows/nm-weekly.yml b/.github/workflows/nm-weekly.yml deleted file mode 100644 index 4f3ce2c4f9f79..0000000000000 --- a/.github/workflows/nm-weekly.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: nm Weekly -run-name: ${{ github.actor }} triggered weekly on ${{ github.ref }} -on: - schedule: - # * is a special character in YAML so you have to quote this string - - cron: '0 1 * * 0' # weekly run (Sun) - - workflow_dispatch: - inputs: - push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: choice - options: - - 'true' - - 'false' - default: 'false' - -jobs: - - BUILD-TEST: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: WEEKLY - python: 3.10.12 - gitref: ${{ github.ref }} - push_binaries_to_pypi: 'no' - - test_label_solo: aws-avx2-32G-a10g-24G - test_label_multi: aws-avx2-192G-4-a10g-96G - test_timeout: 480 - 
test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: aws-avx2-32G-a10g-24G - benchmark_config_list_file: ./.github/data/nm_benchmark_weekly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - - lm_eval_label: gcp-k8s-l4-solo - lm_eval_configuration: ./neuralmagic/lm-eval/full-samll-models.yaml - lm_eval_timeout: 60 - secrets: inherit diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 8a68dceac136c..c6453f6a93a6a 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -1,6 +1,7 @@ from typing import Any, List, Optional import pytest +import torch from tests.nm_utils.utils_skip import should_skip_test_group from vllm import CompletionOutput, LLMEngine, SamplingParams @@ -19,6 +20,8 @@ def vllm_model(vllm_runner): yield vllm_model +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 KV cache constraints") @pytest.mark.skip_global_cleanup def test_stop_basic(vllm_model): _test_stopping(vllm_model.model.llm_engine, @@ -34,6 +37,8 @@ def test_stop_basic(vllm_model): expected_reason=".") +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 KV cache constraints") @pytest.mark.skip_global_cleanup def test_stop_multi_tokens(vllm_model): _test_stopping( @@ -52,6 +57,8 @@ def test_stop_multi_tokens(vllm_model): expected_reason="group of peo") +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 KV cache constraints") @pytest.mark.skip_global_cleanup def test_stop_partial_token(vllm_model): _test_stopping(vllm_model.model.llm_engine, @@ -67,6 +74,8 @@ def test_stop_partial_token(vllm_model): expected_reason="gani") +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 KV cache constraints") @pytest.mark.skip_global_cleanup def test_stop_token_id(vllm_model): # token id 13013 => " organization" diff --git a/tests/entrypoints/test_llm_generate_multiple_loras.py b/tests/entrypoints/test_llm_generate_multiple_loras.py index 8401e9836f1ac..110f9c2aafafd 100644 --- a/tests/entrypoints/test_llm_generate_multiple_loras.py +++ b/tests/entrypoints/test_llm_generate_multiple_loras.py @@ -1,6 +1,7 @@ import weakref import pytest +import torch # downloading lora to test lora requests from huggingface_hub import snapshot_download @@ -54,6 +55,11 @@ def zephyr_lora_files(): return snapshot_download(repo_id=LORA_NAME) +@pytest.mark.skipif( + torch.cuda.get_device_capability() < (8, 0), + reason= + "Bfloat16 is only supported on GPUs with compute capability of at least 8.0" +) @pytest.mark.skip_global_cleanup def test_multiple_lora_requests(llm: LLM, zephyr_lora_files): lora_request = [ diff --git a/tests/models_core/test_llm_logprobs.py b/tests/models_core/test_llm_logprobs.py index be776637c87f6..65186aafabf48 100644 --- a/tests/models_core/test_llm_logprobs.py +++ b/tests/models_core/test_llm_logprobs.py @@ -6,6 +6,7 @@ Run `pytest tests/models/test_models_logprobs.py`. 
""" import pytest +import torch from tests.models.utils import check_logprobs_close from tests.nm_utils.utils_skip import should_skip_test_group @@ -24,6 +25,8 @@ ] +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 memory constraints") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/models_core/test_magic_wand.py b/tests/models_core/test_magic_wand.py index f4f4831f4f1da..f31fb1885506b 100644 --- a/tests/models_core/test_magic_wand.py +++ b/tests/models_core/test_magic_wand.py @@ -1,6 +1,6 @@ """Compare the outputs of a sparse model vs sparse model running dense. -Note: sparse kernels do not have bitwise correctness vs the dense models. -As a result, in this test, we just confirm that the top selected tokens of the +Note: sparse kernels do not have bitwise correctness vs the dense models. +As a result, in this test, we just confirm that the top selected tokens of the sparse models are in the top N selections of same model running dense. Run `pytest tests/models_core/test_magic_wand.py`. @@ -9,6 +9,7 @@ import gc import pytest +import torch from tests.models.utils import check_logprobs_close from tests.nm_utils.utils_skip import should_skip_test_group @@ -25,6 +26,8 @@ ] +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="skip for T4s, requires compute capability 8.0") @pytest.mark.parametrize("model_format_extrablocks", MODEL_FORMAT_EXTRABLOCKS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/models_core/test_server_logprobs.py b/tests/models_core/test_server_logprobs.py index 48a4d7bece18b..023d63e3b93ae 100644 --- a/tests/models_core/test_server_logprobs.py +++ b/tests/models_core/test_server_logprobs.py @@ -67,6 +67,8 @@ async def my_chat( top_logprobs=num_logprobs) +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="T4 memory constraints") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index 1e2dc9197b403..4c28f859e793e 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -3,6 +3,7 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. 
""" import pytest +import torch from tests.conftest import cleanup from tests.nm_utils.utils_skip import should_skip_test_group @@ -26,6 +27,11 @@ ] +@pytest.mark.skipif( + torch.cuda.get_device_capability() < (8, 0), + reason= + "Bfloat16 is only supported on GPUs with compute capability of at least 8.0" +) @pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) def test_disable_sliding_window(model_len_len, ): model, sliding_len, full_len = model_len_len diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 3b7dcbc5983fc..db35679b03acb 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -6,6 +6,7 @@ from dataclasses import dataclass import pytest +import torch from tests.nm_utils.utils_skip import should_skip_test_group from vllm.config import ModelConfig @@ -55,6 +56,8 @@ class ModelPair: ] +@pytest.mark.skipif(torch.cuda.get_device_capability() < (8, 0), + reason="skip for T4s, requires compute capability 8.0") @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) def test_auto_gptq(model_arg_exptype: str) -> None: model_path, quantization_arg, expected_type = model_arg_exptype diff --git a/tests/test_regression.py b/tests/test_regression.py index 5d27d35793017..46298d6dc16fa 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -6,6 +6,7 @@ """ import gc +import pytest import torch from vllm import LLM, SamplingParams @@ -53,6 +54,11 @@ def test_gc(): assert allocated < 50 * 1024 * 1024 +@pytest.mark.skipif( + torch.cuda.get_device_capability() < (8, 0), + reason= + "Bfloat16 is only supported on GPUs with compute capability of at least 8.0" +) def test_model_from_modelscope(monkeypatch): # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat"