Commit

Merge commit '4e12131089f192334f6e09c8fe5cd85af1e25327' into main
zwd003 committed May 11, 2024
2 parents 5688e58 + 4e12131 commit 2609d43
Showing 125 changed files with 4,413 additions and 2,123 deletions.
.buildkite/check-wheel-size.py (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
import os
import zipfile

-MAX_SIZE_MB = 100
+MAX_SIZE_MB = 150


def print_top_10_largest_files(zip_file):
.buildkite/run-amd-test.sh (2 changes: 1 addition & 1 deletion)
@@ -40,5 +40,5 @@ docker run \
-e HF_TOKEN \
--name ${container_name} \
${container_name} \
-/bin/bash -c $(echo $1 | sed "s/^'//" | sed "s/'$//")
+/bin/bash -c "${@}"
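Note on the change above: the quote-stripping sed pipeline is replaced by "${@}", which hands the caller's arguments to the container shell verbatim instead of relying on an unquoted command substitution. A minimal standalone sketch of the difference (run_old and run_new are illustrative names, not part of the repository):

#!/usr/bin/env bash
# Sketch: two ways of passing a command string to a child shell.

# Old style: the caller wraps the command in extra single quotes, which are
# stripped with sed. The unquoted command substitution is then word-split,
# so only the first word reaches "bash -c" as the command string.
run_old() {
  /bin/bash -c $(echo "$1" | sed "s/^'//" | sed "s/'$//")
}

# New style: forward the arguments untouched. With a single quoted argument,
# "bash -c" receives the whole command string verbatim.
run_new() {
  /bin/bash -c "${@}"
}

run_old "'echo one ; echo two'"   # word splitting mangles the command
run_new "echo one ; echo two"     # prints both lines as intended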

.buildkite/test-pipeline.yaml (10 changes: 5 additions & 5 deletions)
@@ -48,7 +48,7 @@ steps:
- pytest -v -s test_pynccl.py

- label: Engine Test
-mirror_hardwares: [amd]
+#mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py

- label: Entrypoints Test
@@ -73,13 +73,13 @@ steps:
parallelism: 4

- label: Models Test
-mirror_hardwares: [amd]
+#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
-- pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
+- pytest -v -s models --ignore=models/test_llava.py

- label: Llava Test
-mirror_hardwares: [amd]
+#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py
@@ -101,7 +101,7 @@ steps:
command: pytest -v -s worker

- label: Speculative decoding tests
-mirror_hardwares: [amd]
+#mirror_hardwares: [amd]
command: pytest -v -s spec_decode

- label: LoRA Test %N
.buildkite/test-template.j2 (6 changes: 5 additions & 1 deletion)
@@ -14,6 +14,8 @@ steps:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
+- exit_status: -10 # Agent was lost
+limit: 5
- wait

- group: "AMD Tests"
@@ -24,7 +26,7 @@ steps:
- label: "AMD: {{ step.label }}"
agents:
queue: amd
-command: bash .buildkite/run-amd-test.sh "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
+command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
env:
DOCKER_BUILDKIT: "1"
{% endif %}
@@ -53,6 +55,8 @@ steps:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
+- exit_status: -10 # Agent was lost
+limit: 5
plugins:
- kubernetes:
podSpec:
.github/workflows/publish.yml (3 changes: 3 additions & 0 deletions)
@@ -58,6 +58,9 @@ jobs:

- name: Setup ccache
uses: hendrikmuhs/[email protected]
+with:
+create-symlink: true
+key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}

- name: Set up Linux Env
if: ${{ runner.os == 'Linux' }}
CMakeLists.txt (18 changes: 11 additions & 7 deletions)
@@ -167,7 +167,7 @@ set(VLLM_EXT_SRC
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
-"csrc/quantization/fp8/fp8_cuda_kernels.cu"
+"csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/pybind.cpp")
@@ -219,7 +219,8 @@ set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
-"csrc/punica/punica_ops.cc")
+"csrc/punica/punica_ops.cu"
+"csrc/punica/punica_pybind.cpp")

#
# Copy GPU compilation flags+update for punica
@@ -243,6 +244,9 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA")
endif()
endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
+set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
+message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()

if (VLLM_PUNICA_GPU_ARCHES)
@@ -277,11 +281,6 @@ add_custom_target(default)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
-endif()

-if(VLLM_GPU_LANG STREQUAL "CUDA")
-message(STATUS "Enabling moe extension.")
-add_dependencies(default _moe_C)

# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
@@ -292,3 +291,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
add_dependencies(default _punica_C)
endif()
endif()

+if(VLLM_GPU_LANG STREQUAL "CUDA")
+message(STATUS "Enabling moe extension.")
+add_dependencies(default _moe_C)
+endif()
Dockerfile (21 changes: 0 additions & 21 deletions)
@@ -87,23 +87,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip cache remove vllm_nccl*
#################### EXTENSION Build IMAGE ####################

-#################### FLASH_ATTENTION Build IMAGE ####################
-FROM dev as flash-attn-builder
-# max jobs used for build
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# flash attention version
-ARG flash_attn_version=v2.5.8
-ENV FLASH_ATTN_VERSION=${flash_attn_version}

-WORKDIR /usr/src/flash-attention-v2

-# Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
-    --no-build-isolation --no-deps --no-cache-dir

-#################### FLASH_ATTENTION Build IMAGE ####################

#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
@@ -122,10 +105,6 @@ RUN ldconfig /usr/local/cuda-12.4/compat/
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose

-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE ####################


Dockerfile.rocm (3 changes: 3 additions & 0 deletions)
@@ -94,6 +94,9 @@ COPY . .

RUN python3 -m pip install --upgrade pip numba

+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1

RUN --mount=type=cache,target=/root/.cache/pip \
pip install -U -r requirements-rocm.txt \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
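Note on the change above: VLLM_INSTALL_PUNICA_KERNELS=1 mirrors the environment-variable switch that CMakeLists.txt checks before adding the _punica_C target (see the CMakeLists.txt hunks above). For a local, non-Docker build the same switch can be exported before installing; a hedged sketch, assuming the usual pip-based source install of vLLM:

# Build vLLM from source with the punica (LoRA) kernels enabled.
# CMakeLists.txt reads VLLM_INSTALL_PUNICA_KERNELS from the environment.
VLLM_INSTALL_PUNICA_KERNELS=1 pip install -e .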
cmake/utils.cmake (4 changes: 2 additions & 2 deletions)
@@ -99,7 +99,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"Failed to determine torch nvcc compiler flags")

if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
-list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
+list(APPEND GPU_FLAGS "-DENABLE_FP8")
endif()
if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
list(REMOVE_ITEM GPU_FLAGS
@@ -119,7 +119,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)

list(APPEND GPU_FLAGS
"-DUSE_ROCM"
-"-DENABLE_FP8_E4M3"
+"-DENABLE_FP8"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-fno-gpu-rdc")
