Add xpu support for int8
zhuhong61 committed Jun 27, 2024
1 parent 7f43430 commit 555a139
Showing 5 changed files with 10 additions and 4 deletions.
bitsandbytes/autograd/_functions.py (2 additions, 2 deletions)
@@ -221,7 +221,7 @@ def backward(ctx, grad_output):

 def supports_igemmlt(device: torch.device) -> bool:
     """check if this device supports the optimized int8 kernel"""
-    if device == torch.device("cpu"):
+    if device.type in ("cpu", "xpu"):
         return True
     if torch.version.hip:
         return False if BNB_HIP_VERSION < 601 else True
@@ -321,7 +321,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):

         # Cast A to fp16
         A_dtype = torch.float16
-        if A.device == torch.device("cpu"):
+        if A.device.type in ("cpu", "xpu"):
             A_dtype = torch.bfloat16
         if A.dtype != A_dtype:
             warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to {A_dtype} during quantization")
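
A minimal sketch of how the widened device gate is meant to behave; the helper below is hypothetical (not part of the library) and only illustrates the cpu/xpu check in supports_igemmlt together with the bfloat16 cast in MatMul8bitLt:

    import torch

    def _int8_quant_dtype(device: torch.device) -> torch.dtype:
        # Hypothetical helper: CPU and XPU quantize from bfloat16, other devices from float16.
        return torch.bfloat16 if device.type in ("cpu", "xpu") else torch.float16

    print(_int8_quant_dtype(torch.device("cpu")))      # torch.bfloat16
    print(_int8_quant_dtype(torch.device("xpu", 0)))   # torch.bfloat16
    print(_int8_quant_dtype(torch.device("cuda", 0)))  # torch.float16
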
bitsandbytes/backends/cpu_xpu_common.py (1 addition, 0 deletions)
@@ -516,6 +516,7 @@ def gemm_4bit_impl(
     if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(state, "op_context"):
         assert state.op_context is not None
         output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle())
+    # TODO: Support XPU optimization path
     else:
         dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize)
         output = torch.matmul(A, dqB)
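
For context, the else branch above is the generic fallback used when no fused IPEX weight-only-quantization kernel is available: dequantize the packed 4-bit weight and run a plain matmul. A rough standalone sketch (the function below is illustrative, not the library's API):

    import torch

    def gemm_4bit_fallback(A: torch.Tensor, dqB: torch.Tensor, bias=None) -> torch.Tensor:
        # dqB stands in for the output of dequantize_4bit_impl(B, state, ...):
        # a dense weight already restored to a floating-point dtype.
        out = torch.matmul(A, dqB.to(A.dtype))
        if bias is not None:
            out = out + bias
        return out
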
bitsandbytes/backends/xpu.py (3 additions, 0 deletions)
@@ -30,6 +30,9 @@ def assert_on_xpu(tensors):


 class XPUBackend(Backend):
+    mm_dequant_compute_dtype = torch.bfloat16
+    mm_dequant_output_dtype = torch.bfloat16
+
     def double_quant(
         self,
         A: torch.Tensor,
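
The two class attributes pick bfloat16 for the int8 matmul-dequantize path on XPU. As a rough illustration of what they configure (a hypothetical sketch, not the backend's implementation): the int32 accumulator is rescaled in the compute dtype and returned in the output dtype.

    import torch

    def mm_dequant_sketch(
        acc_int32: torch.Tensor,   # int32 result of the int8 x int8 GEMM
        row_stats: torch.Tensor,   # per-row absmax of A
        col_stats: torch.Tensor,   # per-column absmax of B
        compute_dtype=torch.bfloat16,
        output_dtype=torch.bfloat16,
    ) -> torch.Tensor:
        scale = row_stats.to(compute_dtype).unsqueeze(1) * col_stats.to(compute_dtype).unsqueeze(0)
        out = acc_int32.to(compute_dtype) * scale / (127.0 * 127.0)
        return out.to(output_dtype)
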
bitsandbytes/functional.py (1 addition, 1 deletion)
@@ -1771,7 +1771,7 @@ class COOSparseTensor:
     def __init__(self, rows, cols, nnz, rowidx, colidx, values):
         assert rowidx.dtype == torch.int32
         assert colidx.dtype == torch.int32
-        if values.device == torch.device("cpu"):
+        if values.device.type in ("cpu", "xpu"):
             assert values.dtype in [torch.bfloat16, torch.half, torch.float]
         else:
             assert values.dtype == torch.float16
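
A rough usage sketch of the relaxed dtype check, based only on the constructor shown above: bfloat16 values are now accepted on CPU and XPU, while CUDA tensors still have to be float16.

    import torch
    import bitsandbytes as bnb

    rowidx = torch.tensor([0, 1], dtype=torch.int32)
    colidx = torch.tensor([2, 3], dtype=torch.int32)
    values = torch.tensor([1.5, -2.0], dtype=torch.bfloat16)  # ok on cpu/xpu, would fail the cuda assert
    coo = bnb.functional.COOSparseTensor(4, 4, values.numel(), rowidx, colidx, values)
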
bitsandbytes/nn/modules.py (3 additions, 1 deletion)
@@ -610,7 +610,7 @@ def cpu(self):

     def xpu(self):
         # we store the 8-bit row-major weight
-        B = self.data.contiguous().bfloat16().cpu()
+        B = self.data.contiguous().bfloat16().xpu()
         CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
         if CBt is not None:
             del CBt
@@ -642,6 +642,8 @@ def to(self, *args, **kwargs):
             return self.cuda(device)
         elif device.type == "cpu" and self.data.dtype != torch.int8:
             return self.cpu()
+        elif device.type == "xpu" and self.data.dtype != torch.int8:
+            return self.xpu()
         else:
             new_param = Int8Params(
                 super().to(device=device, dtype=dtype, non_blocking=non_blocking),
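
Taken together, the two hunks let an 8-bit module be moved to an XPU device the same way it is moved to CPU. A hypothetical end-to-end usage, assuming a PyTorch/IPEX build where an xpu device is actually available:

    import torch
    import bitsandbytes as bnb

    linear = bnb.nn.Linear8bitLt(64, 64, has_fp16_weights=False)
    linear = linear.to("xpu")  # Int8Params.to dispatches to Int8Params.xpu, which runs double_quant
    x = torch.randn(8, 64, dtype=torch.bfloat16, device="xpu")
    y = linear(x)
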
