diff --git a/CMakeLists.txt b/CMakeLists.txt
index 246086e58..f0f1aa869 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -240,26 +240,26 @@ elseif(BUILD_MPS)
     add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
 elseif(BUILD_NPU)
     list(APPEND SRC_FILES ${NPU_FILES})
-
+
     set(SOC_VERSION "Ascend910B4" CACHE STRING "system on chip type")
     set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME_PATH} CACHE
         STRING "ASCEND CANN package installation directory"
     )
-
+
     # ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}.
     # ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library
     # file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/csrc/npu_kernels.cpp)
     file(GLOB KERNEL_FILES csrc/npu_kernels.cpp)
-
+
     if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
         set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
     elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
         set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
     else()
-        message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed")
+        message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the CANN package is installed")
     endif()
     include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
-
+
     # ascendc_library use to add kernel file to generate ascendc library
     ascendc_library(ascendc_kernels_npu STATIC ${KERNEL_FILES})
diff --git a/_typos.toml b/_typos.toml
index e4e7287fb..8ad7d5969 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -9,6 +9,7 @@ extend-ignore-re = [

 [type.py.extend-words]
 "BA" = "BA" # used as a commented-out variable in tests
+"cann" = "cann" # CANN (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for Ascend NPUs

 [type.cuda.extend-words]
 "subtile" = "subtile"
diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py
index b2a3a49e3..8fdf7569d 100644
--- a/bitsandbytes/backends/cpu_xpu_common.py
+++ b/bitsandbytes/backends/cpu_xpu_common.py
@@ -23,7 +23,7 @@

 gxx_available = False
 try:
-    subprocess.run(["g++", "--version"], capture_output=True) # hide terminal output
+    subprocess.run(["g++", "--version"], capture_output=True)  # hide terminal output
     gxx_available = True
 except BaseException:
     warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.")
diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
index dfb639cbd..ad478431c 100644
--- a/bitsandbytes/backends/cuda.py
+++ b/bitsandbytes/backends/cuda.py
@@ -78,7 +78,6 @@


 class CUDABackend(Backend):
-
     def double_quant(
         self,
         A: torch.Tensor,
diff --git a/bitsandbytes/backends/mps.py b/bitsandbytes/backends/mps.py
index 6391e818a..5b7eda0c7 100644
--- a/bitsandbytes/backends/mps.py
+++ b/bitsandbytes/backends/mps.py
@@ -8,7 +8,6 @@


 class MPSBackend(Backend):
-
     def double_quant(
         self,
         A: torch.Tensor,
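Review note on the `cpu_xpu_common.py` hunk above: besides the comment-spacing fix, the line it touches is a small capability probe — `torch.compile` for CPU/XPU is enabled only if invoking `g++` succeeds. A minimal standalone sketch of that pattern; the narrower `except` clause and the `returncode` check are small deviations from the module, which catches `BaseException` and ignores the exit status:

```python
import subprocess
import warnings

gxx_available = False
try:
    # capture_output=True hides the version banner from the terminal.
    result = subprocess.run(["g++", "--version"], capture_output=True)
    gxx_available = result.returncode == 0
except (OSError, subprocess.SubprocessError):
    warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.")
```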
diff --git a/bitsandbytes/backends/npu.py b/bitsandbytes/backends/npu.py
index 5457563a5..8cdd6d10b 100644
--- a/bitsandbytes/backends/npu.py
+++ b/bitsandbytes/backends/npu.py
@@ -29,7 +29,6 @@ def assert_on_npu(tensors):


 class NPUBackend(Backend):
-
     def double_quant(
         self,
         A: torch.Tensor,
@@ -107,21 +106,21 @@ def quantize_4bit(
         torch.npu.set_device(A.device)
         if A.dtype in [torch.float32, torch.float16, torch.bfloat16]:
             data = [
-                -1.0,
-                -0.6961928009986877,
+                -1.0,
+                -0.6961928009986877,
                 -0.5250730514526367,
                 -0.39491748809814453,
                 -0.28444138169288635,
                 -0.18477343022823334,
-                -0.09105003625154495,
-                0.0,
-                0.07958029955625534,
-                0.16093020141124725,
-                0.24611230194568634,
-                0.33791524171829224,
-                0.44070982933044434,
-                0.5626170039176941,
-                0.7229568362236023,
+                -0.09105003625154495,
+                0.0,
+                0.07958029955625534,
+                0.16093020141124725,
+                0.24611230194568634,
+                0.33791524171829224,
+                0.44070982933044434,
+                0.5626170039176941,
+                0.7229568362236023,
                 1.0,
             ]
             data = torch.tensor(data, device="npu", dtype=torch.float32).view(1, -1)
@@ -132,10 +131,10 @@ def quantize_4bit(
             out = out.reshape(-1, 2)
             out = (out[:, 0] + out[:, 1] * 16).to(torch.uint8)
         else:
-            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
+            raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}")
         assert_on_npu([A, absmax, out])
         torch.npu.set_device(prev_device)
-
+
         code = get_4bit_type(quant_type, device=A.device)
         state = QuantState(
             absmax=absmax,
@@ -164,7 +163,7 @@ def dequantize_4bit(
             raise ValueError(
                 f"The blockwise of {blocksize} is not supported. Supported values: {supported_blocksizes}"
             )
-
+
         if quant_state is None:
             assert absmax is not None and out is not None
             quant_state = QuantState(
@@ -192,7 +191,7 @@ def dequantize_4bit(
                 get_ptr(out),
                 ct.c_int(quant_state.blocksize),
                 ct.c_int(n),
-                torch.npu.current_stream(),
+                torch.npu.current_stream(),
             )
         elif out.dtype == torch.float16:
             lib.cdequantize_blockwise_fp16_nf4(
@@ -201,7 +200,7 @@ def dequantize_4bit(
                 get_ptr(out),
                 ct.c_int(quant_state.blocksize),
                 ct.c_int(n),
-                torch.npu.current_stream(),
+                torch.npu.current_stream(),
             )
         elif out.dtype == torch.bfloat16:
             # bf16: bf16 -> fp32 -> op -> fp32 -> bf16
@@ -213,7 +212,7 @@ def dequantize_4bit(
                 get_ptr(out),
                 ct.c_int(quant_state.blocksize),
                 ct.c_int(n),
-                torch.npu.current_stream()
+                torch.npu.current_stream(),
             )
             out = out.to(torch.bfloat16)
         else:
diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 6dc5fb63b..ec329cbb6 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -25,7 +25,7 @@


 from bitsandbytes.consts import DYNAMIC_LIBRARY_SUFFIX, PACKAGE_DIR
 from bitsandbytes.cuda_specs import CUDASpecs, get_cuda_specs, get_rocm_gpu_arch
-from bitsandbytes.npu_specs import get_npu_specs
+from bitsandbytes.npu_specs import get_npu_specs

 logger = logging.getLogger(__name__)
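Review note on the `quantize_4bit` hunk above: the reindented float table is the NF4 codebook — 16 quantiles of a standard normal distribution rescaled to [-1, 1]. As a reading aid, here is a pure-PyTorch sketch of the blockwise scheme those lines feed: per-block absmax scaling, nearest-codebook-entry lookup, and the same two-nibbles-per-byte packing as `out[:, 0] + out[:, 1] * 16`. This is a reference illustration, not the NPU kernel; `nf4_quantize_ref` is a hypothetical name and the input length is assumed to be a multiple of `blocksize`:

```python
import torch

# The 16 NF4 codebook values from the quantize_4bit hunk above.
NF4_CODE = torch.tensor([
    -1.0, -0.6961928009986877, -0.5250730514526367, -0.39491748809814453,
    -0.28444138169288635, -0.18477343022823334, -0.09105003625154495, 0.0,
    0.07958029955625534, 0.16093020141124725, 0.24611230194568634,
    0.33791524171829224, 0.44070982933044434, 0.5626170039176941,
    0.7229568362236023, 1.0,
])

def nf4_quantize_ref(A: torch.Tensor, blocksize: int = 64):
    """Blockwise NF4 quantization, CPU reference (assumes numel % blocksize == 0)."""
    blocks = A.float().flatten().reshape(-1, blocksize)
    absmax = blocks.abs().amax(dim=1, keepdim=True)  # per-block scale
    scaled = blocks / absmax.clamp(min=1e-12)        # guard all-zero blocks; now in [-1, 1]
    # Nearest codebook index per element: (n_blocks, blocksize, 16) -> argmin over codes.
    idx = (scaled.unsqueeze(-1) - NF4_CODE).abs().argmin(dim=-1)
    # Pack two 4-bit indices per byte, matching out[:, 0] + out[:, 1] * 16.
    pairs = idx.flatten().reshape(-1, 2)
    packed = (pairs[:, 0] + pairs[:, 1] * 16).to(torch.uint8)
    return packed, absmax.flatten()

packed, absmax = nf4_quantize_ref(torch.randn(4096))  # 4096 values -> 2048 bytes
```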
diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index bd0535346..0ae65dab2 100755
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -313,7 +313,7 @@ def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: b

     def cpu(self, non_blocking: bool = False):
         return self.to(device="cpu", non_blocking=non_blocking)
-
+
     def npu(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
         # `torch.Tensor.to()` is not supported by `torch_npu` (see this [issue](https://github.com/Ascend/pytorch/issues/16)).
         if isinstance(device, int):
diff --git a/setup.py b/setup.py
index eecd2e7ff..e8d3f547c 100644
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,7 @@ def get_version_and_write_to_file():
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()

+
 # Tested with wheel v0.29.0
 class BinaryDistribution(Distribution):
     def has_ext_modules(self):
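Review note on the `npu()` hunk above: the change only drops a stray blank line, but the comment it carries documents a real constraint — `torch_npu` does not support `torch.Tensor.to()` (see the linked Ascend issue), so the method must normalize its `device` argument itself before moving data. A minimal sketch of that normalization step; `normalize_npu_device` is a hypothetical helper, and mapping a bare `int` to `f"npu:{device}"` is an assumption about the method body this hunk truncates:

```python
from typing import Optional, Union

import torch

def normalize_npu_device(device: Optional[Union[int, torch.device, str]] = None) -> str:
    """Normalize the accepted device spellings to the 'npu[:N]' string form."""
    if device is None:
        return "npu"            # current default NPU device
    if isinstance(device, int):
        return f"npu:{device}"  # bare index -> explicit device string (assumed mapping)
    return str(device)          # torch.device or str pass through unchanged

assert normalize_npu_device(0) == "npu:0"
```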