Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

Permalink
Upstream sync 2024 06 16 (#307)
Browse files Browse the repository at this point in the history
Upstream sync 2024 06 16
(#310) - v0.5.0.post1 of vllm

SUMMARY:

* Merge commits from
vllm-project@8f89d72
to
vllm-project@0f0d8bc
* Limit numpy to < 2.0
* Updated `run-tests` to print name of the test that is about to run
(for debugging what hangs in automation)
* Disable usage stats in automation
* Temporarily disable ENTRYPOINTS (to be re-enabled in Andy's single whl
PR)
* Updated `run-tests` to consider exit code 5 from pytest to be a pass
(since exit code 5 from pytest means that we did not run any tests)

Note that
vllm-project@8f89d72
is NOT included in this merge.

COMPARE vs UPSTREAM:


https://github.com/neuralmagic/nm-vllm/compare/upstream-sync-2024-06-16..vllm-project:vllm:v0.5.0.post1

---------

Signed-off-by: kevin <[email protected]>
Signed-off-by: Travis Johnson <[email protected]>
Signed-off-by: Wang, Yi A <[email protected]>
Co-authored-by: Michael Goin <[email protected]>
Co-authored-by: Simon Mo <[email protected]>
Co-authored-by: SangBin Cho <[email protected]>
Co-authored-by: Woosuk Kwon <[email protected]>
Co-authored-by: Li, Jiang <[email protected]>
Co-authored-by: Kevin H. Luu <[email protected]>
Co-authored-by: Cody Yu <[email protected]>
Co-authored-by: Arthur Kim <[email protected]>
Co-authored-by: Travis Johnson <[email protected]>
Co-authored-by: Sanger Steel <[email protected]>
Co-authored-by: Roger Wang <[email protected]>
Co-authored-by: youkaichao <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
Co-authored-by: Wang, Yi <[email protected]>
Co-authored-by: Dipika Sikka <[email protected]>
Co-authored-by: wenyujin333 <[email protected]>
Co-authored-by: Jianan Gu <[email protected]>
Co-authored-by: Tyler Michael Smith <[email protected]>
Co-authored-by: zifeitong <[email protected]>
Co-authored-by: Philipp Moritz <[email protected]>
Co-authored-by: Antoni Baum <[email protected]>
Co-authored-by: Jie Fu (傅杰) <[email protected]>
Co-authored-by: Allen.Dou <[email protected]>
  • Loading branch information
1 parent d8da97b commit dd39914
Show file tree
Hide file tree
Showing 116 changed files with 4,433 additions and 1,469 deletions.
1 change: 1 addition & 0 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ steps:
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s spec_decode/e2e/test_integration_dist.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

- label: Distributed Tests (Multiple Groups)
#mirror_hardwares: [amd]
Expand Down
32 changes: 30 additions & 2 deletions .buildkite/test-template-aws.j2
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ steps:
queue: cpu_queue
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
Expand All @@ -19,6 +19,34 @@ steps:
limit: 5
- wait

- group: "AMD Tests"
depends_on: ~
steps:
{% for step in steps %}
{% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
- label: "AMD: {{ step.label }}"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
soft_fail: true
{% endif %}
{% endfor %}

- label: "Neuron Test"
depends_on: ~
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh
soft_fail: false

- label: "Intel Test"
depends_on: ~
agents:
queue: intel
command: bash .buildkite/run-cpu-test.sh

{% for step in steps %}
- label: "{{ step.label }}"
agents:
Expand All @@ -31,7 +59,7 @@ steps:
{% else %}
queue: gpu_1_queue
{% endif %}
soft_fail: true
soft_fail: {{ step.soft_fail or false }}
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
Expand Down
3 changes: 3 additions & 0 deletions .github/actions/nm-set-env/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ runs:
# testmo
echo "XDG_CONFIG_HOME=/usr/local/apps" >> $GITHUB_ENV
echo "PROJECT_ID=12" >> $GITHUB_ENV
# disable usage stats (writes to protected /usr/local/apps)
echo "VLLM_NO_USAGE_STATS=1" >> $GITHUB_ENV
echo "DO_NOT_TRACK=1" >> $GITHUB_ENV
env:
HF_TOKEN_SECRET: ${{ inputs.hf_token }}
shell: bash
17 changes: 16 additions & 1 deletion .github/scripts/run-tests
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ do
LOCAL_SUCCESS=0
RESULT_XML=$(echo ${TEST} | sed -e "s/${TEST_DIR}/${RESULTS_DIR}/" | sed -e "s/.py/.xml/")

# report which test is being run
# (in CI, if a test hangs, this logs *which* test is running *before* it hangs)
echo "=== RUNNING TEST: ${TEST} ==="

# this is a bit messy and brittle, but certain tests
# need to be run with specific options
if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
Expand All @@ -125,7 +129,18 @@ do
pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
fi

SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
# if a file gets exit code 0, we are good
if [[ $LOCAL_SUCCESS == 0 ]]; then
echo "=== PASSED TEST: ${TEST} ==="
# if a file does not run any tests, pytest reports exit code of 5
# since we skip full modules in our skipping strategy, this is common
elif [[ $LOCAL_SUCCESS == 5 ]]; then
echo "=== SKIPPED TEST: ${TEST} ==="
# otherwise, report failure
else
echo "=== FAILED TEST: ${TEST} ==="
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
fi

done

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/ruff.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .
Expand Down
8 changes: 4 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,17 +180,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/custom_all_reduce.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")

#
# The CUTLASS kernels for Hopper require sm90a to be enabled.
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
set_source_files_properties(
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
PROPERTIES
COMPILE_FLAGS
"-gencode arch=compute_90a,code=sm_90a")
Expand Down
6 changes: 4 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
# prepare basic build environment
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git
RUN apt-get update -y \
&& apt-get install -y python3-pip git curl sudo

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
Expand All @@ -27,6 +27,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-cuda.txt

# install development dependencies
COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
Expand Down
8 changes: 6 additions & 2 deletions Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
FROM ubuntu:22.04 AS cpu-test-1

RUN apt-get update -y \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
&& apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc

RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl

RUN pip install --upgrade pip \
&& pip install wheel packaging ninja "setuptools>=49.4.0" numpy

Expand All @@ -21,6 +25,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

CMD ["/bin/bash"]
19 changes: 19 additions & 0 deletions Dockerfile.tpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Image that builds vLLM with VLLM_TARGET_DEVICE="tpu" on top of a nightly
# PyTorch/XLA TPU-VM base image.
#
# Both ARGs are declared before FROM so that they can be used in the FROM line.
# NIGHTLY_DATE is interpolated into the default BASE_IMAGE tag; passing
# --build-arg BASE_IMAGE=... replaces the whole reference, in which case
# NIGHTLY_DATE has no effect.
ARG NIGHTLY_DATE="20240601"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"

FROM $BASE_IMAGE

# Copy the entire build context into /workspace/vllm.
WORKDIR /workspace
COPY . /workspace/vllm

# NOTE(review): presumably read by setup.py (invoked below) to select the TPU
# backend -- confirm against setup.py.
ENV VLLM_TARGET_DEVICE="tpu"
# Install aiohttp separately to avoid build errors.
RUN pip install aiohttp
# Install the TPU and Pallas dependencies.
# Each -f (--find-links) adds an extra page that pip searches for wheels.
RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html

# Build vLLM.
# `setup.py develop` performs a development-mode install, so the installed
# package points at the sources in /workspace/vllm.
RUN cd /workspace/vllm && python setup.py develop

# Default to an interactive shell; no server is started automatically.
CMD ["/bin/bash"]
8 changes: 6 additions & 2 deletions benchmarks/backend_request_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,13 @@ async def async_request_tgi(
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")

chunk = remove_prefix(chunk_bytes.decode("utf-8"),
"data:")
#NOTE: Sometimes TGI returns a ping response without
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
chunk = remove_prefix(chunk_bytes, "data:")

data = json.loads(chunk)
timestamp = time.perf_counter()
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu"],
choices=["cuda", "cpu", "tpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument('--block-size',
type=int,
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/benchmark_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def main(args: argparse.Namespace):
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu"],
choices=["cuda", "cpu", "tpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument(
"--enable-prefix-caching",
Expand Down
6 changes: 1 addition & 5 deletions benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,7 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
scale_b: torch.tensor,
out_dtype: torch.dtype) -> torch.tensor:
return ops.cutlass_scaled_mm_dq(a,
b,
scale_a,
scale_b,
out_dtype=out_dtype)
return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype)


# bench
Expand Down
6 changes: 5 additions & 1 deletion cmake/cpu_extension.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ function (find_isa CPUINFO TARGET OUT)
endif()
endfunction()

find_isa(${CPUINFO} "avx2" AVX2_FOUND)
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)

if (AVX512_FOUND)
Expand All @@ -53,8 +54,11 @@ if (AVX512_FOUND)
else()
message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
endif()
elseif (AVX2_FOUND)
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
message(WARNING "vLLM CPU backend using AVX2 ISA")
else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 ISA support.")
endif()

message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
Expand Down
Loading

0 comments on commit dd39914

Please sign in to comment.