From 1ae7c6b2ede9eda1e81887ca282121c253631381 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 3 Dec 2024 23:26:54 -0500
Subject: [PATCH] doc cleanup

---
 bitsandbytes/functional.py | 194 +++++++++++++++++++------------------
 docs/source/_toctree.yml   |   2 +-
 2 files changed, 100 insertions(+), 96 deletions(-)

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
index d1c5d1d2e..a5cc4a9f0 100644
--- a/bitsandbytes/functional.py
+++ b/bitsandbytes/functional.py
@@ -483,17 +483,13 @@ def _get_tensor_stream(tensor: Tensor) -> ct.c_void_p:
 
 
 def get_ptr(A: Optional[Tensor]) -> Optional[ct.c_void_p]:
-    """
-    Get the ctypes pointer from a PyTorch Tensor.
+    """Gets the memory address of the first element of a tensor.
 
-    Parameters
-    ----------
-    A : torch.tensor
-        The PyTorch tensor.
+    Args:
+        A (`Optional[Tensor]`): A PyTorch tensor.
 
-    Returns
-    -------
-    ctypes.c_void_p
+    Returns:
+        `Optional[ct.c_void_p]`: A pointer to the underlying tensor data.
     """
     if A is None:
         return None
@@ -863,30 +859,31 @@ def quantize_blockwise(
     blocksize=4096,
     nested=False,
 ) -> Tuple[torch.Tensor, QuantState]:
-    """
-    Quantize tensor A in blocks of size 4096 values.
+    """Quantize a tensor in blocks of values.
 
-    Quantizes tensor A by dividing it into blocks of 4096 values.
-    Then the absolute maximum value within these blocks is calculated
-    for the non-linear quantization.
+    The input tensor is quantized by dividing it into blocks of `blocksize` values.
+    The absolute maximum value within these blocks is calculated for scaling
+    the non-linear quantization.
 
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input tensor.
-    code : torch.Tensor
-        The quantization map.
-    absmax : torch.Tensor
-        The absmax values.
-    out : torch.Tensor
-        The output tensor (8-bit).
+    Args:
+        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
+        code (`torch.Tensor`, *optional*):
+            A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
+            For more details, see [8-Bit Approximations for Parallelism in Deep Learning](https://arxiv.org/abs/1511.04561).
+        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
+        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+        blocksize (`int`, *optional*):
+            The size of the blocks. Defaults to 4096.
+            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+        nested (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
 
-    Returns
-    -------
-    torch.Tensor:
-        The 8-bit tensor.
-    tuple(torch.Tensor, torch.Tensor):
-        The quantization state to undo the quantization.
+    Raises:
+        ValueError: Raised when the input data type is not supported.
+
+    Returns:
+        `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
+        - `torch.Tensor`: The quantized tensor.
+        - [`QuantState`]: The state object used to undo the quantization.
     """
 
     if code is None:
@@ -967,31 +964,38 @@ def dequantize_blockwise(
     blocksize: int = 4096,
     nested=False,
 ) -> torch.Tensor:
-    """
-    Dequantizes blockwise quantized values.
+    """Dequantize a tensor in blocks of values.
 
-    Dequantizes the tensor A with maximum absolute values absmax in
-    blocks of size 4096.
+    The input tensor is dequantized by dividing it into blocks of `blocksize` values.
+    The absolute maximum value within these blocks is used for scaling
+    the non-linear dequantization.
 
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input 8-bit tensor.
-    quant_state : QuantState
-        Object with code, absmax and other quantization state components.
-    absmax : torch.Tensor
-        The absmax values.
-    code : torch.Tensor
-        The quantization map.
-    out : torch.Tensor
-        Dequantized output tensor (default: float32)
+    Args:
+        A (`torch.Tensor`): The quantized input tensor.
+        quant_state ([`QuantState`], *optional*):
+            The quantization state as returned by [`quantize_blockwise`].
+            Required if `absmax` is not provided.
+        absmax (`torch.Tensor`, *optional*):
+            A tensor containing the scaling values.
+            Required if `quant_state` is not provided and ignored otherwise.
+        code (`torch.Tensor`, *optional*):
+            A mapping describing the low-bit data type. Defaults to a signed 8-bit dynamic type.
+            For more details, see [8-Bit Approximations for Parallelism in Deep Learning](https://arxiv.org/abs/1511.04561).
+            Ignored when `quant_state` is provided.
+        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+        blocksize (`int`, *optional*):
+            The size of the blocks. Defaults to 4096.
+            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+            Ignored when `quant_state` is provided.
+
+    Raises:
+        ValueError: Raised when the input data type is not supported.
 
-    Returns
-    -------
-    torch.Tensor:
-        Dequantized tensor (default: float32)
+    Returns:
+        `torch.Tensor`:
+            The dequantized tensor. The datatype is indicated by `quant_state.dtype` and defaults to `torch.float32`.
     """
+
     assert quant_state is not None or absmax is not None
 
     if code is None and quant_state is None:
         if "dynamic" not in name2qmap:
@@ -1166,31 +1170,30 @@ def quantize_4bit(
     quant_type="fp4",
     quant_storage=torch.uint8,
 ) -> Tuple[torch.Tensor, QuantState]:
-    """
-    Quantize tensor A in blocks of 4-bit values.
+    """Quantize tensor A in blocks of 4-bit values.
 
-    Quantizes tensor A by dividing it into blocks which are independently quantized to FP4.
+    Quantizes tensor A by dividing it into blocks which are independently quantized.
 
-    Parameters
-    ----------
-    A : torch.Tensor
-        The input tensor.
-    absmax : torch.Tensor
-        The absmax values.
-    out : torch.Tensor
-        The output tensor.
-    blocksize : int
-        The blocksize used in quantization.
-    quant_type : str
-        The 4-bit quantization data type {fp4, nf4}
+    Args:
+        A (`torch.Tensor`): The input tensor. Supports `float16`, `bfloat16`, or `float32` datatypes.
+        absmax (`torch.Tensor`, *optional*): A tensor to use to store the absmax values.
+        out (`torch.Tensor`, *optional*): A tensor to use to store the result.
+        blocksize (`int`, *optional*):
+            The size of the blocks. Defaults to 64.
+            Valid values are 64, 128, 256, 512, 1024, 2048, and 4096.
+        compress_statistics (`bool`, *optional*): Whether to additionally quantize the absmax values. Defaults to False.
+        quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`.
+        quant_storage (`torch.dtype`, *optional*): The dtype of the tensor used to store the result. Defaults to `torch.uint8`.
 
-    Returns
-    -------
-    torch.Tensor:
-        Tensor with packed 4-bit values.
-    tuple(torch.Tensor, torch.Size, torch.dtype, int):
-        The quantization state to undo the quantization.
+    Raises:
+        ValueError: Raised when the input data type is not supported.
+
+    Returns:
+        `Tuple[torch.Tensor, QuantState]`: A tuple containing the quantization results.
+        - `torch.Tensor`: The quantized tensor with packed 4-bit values.
+        - [`QuantState`]: The state object used to undo the quantization.
""" + if A.device.type != "cuda": raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}") if quant_type not in ["fp4", "nf4"]: @@ -1297,32 +1300,33 @@ def dequantize_4bit( blocksize: int = 64, quant_type="fp4", ) -> torch.Tensor: - """ - Dequantizes FP4 blockwise quantized values. + """Dequantizes a packed 4-bit quantized tensor. - Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize. + The input tensor is dequantized by dividing it into blocks of `blocksize` values. + The the absolute maximum value within these blocks is used for scaling + the non-linear dequantization. - Parameters - ---------- - A : torch.Tensor - The input tensor (packed 4-bit values). - quant_state : QuantState - object with quantisation stats, incl. absmax values, original tensor shape and original dtype. - absmax : torch.Tensor - The absmax values. - out : torch.Tensor - Dequantized output tensor. - blocksize : int - The blocksize used in quantization. - quant_type : str - The 4-bit quantization data type {fp4, nf4} + Args: + A (`torch.Tensor`): The quantized input tensor. + quant_state ([`QuantState`], *optional*): + The quantization state as returned by [`quantize_4bit`]. + Required if `absmax` is not provided. + absmax (`torch.Tensor`, *optional*): + A tensor containing the scaling values. + Required if `quant_state` is not provided and ignored otherwise. + out (`torch.Tensor`, *optional*): A tensor to use to store the result. + blocksize (`int`, *optional*): + The size of the blocks. Defaults to 64. + Valid values are 64, 128, 256, 512, 1024, 2048, and 4096. + quant_type (`str`, *optional*): The data type to use: `nf4` or `fp4`. Defaults to `fp4`. + Raises: + ValueError: Raised when the input data type or blocksize is not supported. - Returns - ------- - torch.Tensor: - Dequantized tensor. + Returns: + `torch.Tensor`: The dequantized tensor. """ + if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: raise ValueError( f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]", @@ -2277,7 +2281,7 @@ def int8_linear_matmul(A: torch.Tensor, B: torch.Tensor, out: Optional[torch.Ten Args: A (`torch.Tensor`): The first matrix operand with the data type `torch.int8`. B (`torch.Tensor`): The second matrix operand with the data type `torch.int8`. - out (`torch.Tensor, *optional*): A pre-allocated tensor used to store the result. + out (`torch.Tensor`, *optional*): A pre-allocated tensor used to store the result. dtype (`torch.dtype`, *optional*): The expected data type of the output. Defaults to `torch.int32`. Raises: @@ -2384,7 +2388,7 @@ def int8_mm_dequant( A (`torch.Tensor` with dtype `torch.int32`): The result of a quantized int8 matrix multiplication. row_stats (`torch.Tensor`): The row-wise quantization statistics for the lhs operand of the matrix multiplication. col_stats (`torch.Tensor`): The column-wise quantization statistics for the rhs operand of the matrix multiplication. - out (`torch.Tensor], *optional*): A pre-allocated tensor to store the output of the operation. + out (`torch.Tensor`, *optional*): A pre-allocated tensor to store the output of the operation. bias (`torch.Tensor`, *optional*): An optional bias vector to add to the result. Returns: @@ -2454,7 +2458,7 @@ def get_colrow_absmax( row_stats (`torch.Tensor`, *optional*): If provided, calculation of row statistics is skipped. col_stats (`torch.Tensor`, *optional*): If provided, calculation of column statistics is skipped. 
         nnz_block_ptr (`torch.Tensor`, *optional*): Not used.
-        threshold (`float`, `optional`):
+        threshold (`float`, *optional*):
             An optional threshold for sparse decomposition of outlier features.
             No outliers are held back when 0.0. Defaults to 0.0.

diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index 629c6d0f8..5fa353d6d 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -59,7 +59,7 @@
   - title: k-bit quantizers
     sections:
     - local: reference/nn/linear8bit
-      title: 8-bit quantizer
+      title: LLM.int8()
     - local: reference/nn/linear4bit
       title: 4-bit quantizer
     - local: reference/nn/embeddings
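
Below are a few usage sketches for the APIs whose docstrings this patch touches. They are illustrations written against the documented signatures, not part of the diff itself. First, a minimal round trip through the blockwise 8-bit functions, assuming a CUDA-enabled build of bitsandbytes:

```python
import torch
import bitsandbytes.functional as F

# Quantize a float16 tensor in blocks of 4096 values (the default blocksize).
A = torch.randn(4096, 4096, dtype=torch.float16, device="cuda")
A_q, state = F.quantize_blockwise(A)  # returns (quantized tensor, QuantState)

# Dequantize with the returned QuantState; per the docstring, the output
# dtype follows state.dtype (float16 here).
A_dq = F.dequantize_blockwise(A_q, state)

# Blockwise 8-bit quantization is lossy, so check the reconstruction error.
print((A - A_dq).abs().max())
```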
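
The same round-trip pattern applies to the 4-bit path. A sketch, assuming NF4 and the default blocksize of 64 noted in the docstring:

```python
import torch
import bitsandbytes.functional as F

# Pack a half-precision weight into 4-bit NF4 values; the result stores
# two 4-bit values per uint8 element, so it is roughly half the size.
W = torch.randn(1024, 1024, dtype=torch.float16, device="cuda")
W_q, state = F.quantize_4bit(W, quant_type="nf4")

# The QuantState carries the absmax scales, original shape, dtype, and
# quant_type, so nothing else is needed to reverse the quantization.
W_dq = F.dequantize_4bit(W_q, state)
```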
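
Finally, a rough sketch of the int8 matmul path whose `out` annotations are corrected above. The hand-rolled symmetric absmax scaling is an assumption of this sketch (the library also ships its own int8 quantization helpers); what it shows is how `row_stats` and `col_stats` flow from quantization into `int8_mm_dequant`:

```python
import torch
import bitsandbytes.functional as F

x = torch.randn(8, 64, dtype=torch.float16, device="cuda")   # activations
w = torch.randn(32, 64, dtype=torch.float16, device="cuda")  # weight (out x in)

# Symmetric row-wise absmax quantization to int8 for both operands.
row_stats = x.float().abs().amax(dim=1)  # one scale per row of x
col_stats = w.float().abs().amax(dim=1)  # one scale per output feature
x_i8 = torch.round(x.float() * (127.0 / row_stats[:, None])).to(torch.int8)
w_i8 = torch.round(w.float() * (127.0 / col_stats[:, None])).to(torch.int8)

# int32-accumulated multiplication, then dequantization with the saved stats.
out_i32 = F.int8_linear_matmul(x_i8, w_i8)
out = F.int8_mm_dequant(out_i32, row_stats, col_stats)
```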