From e820409c095ea7cbb5ce156992307b84352cbf90 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 22 Feb 2024 12:16:40 -0500
Subject: [PATCH 01/13] (docs) Clarify Python and CUDA Toolkit version
 requirement (#1076)

(misc) Update CUDA download URLs
---
 bitsandbytes/cuda_setup/main.py |  4 ++--
 docs/source/installation.mdx    |  8 ++++----
 install_cuda.py                 | 15 +++++----------
 install_cuda.sh                 | 32 ++++++++------------------------
 4 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index 14c7abbd8..cd0d94cd7 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -161,7 +161,7 @@ def run_cuda_setup(self):
             self.add_log_entry('3. CUDA not installed')
             self.add_log_entry('4. You have multiple conflicting CUDA libraries')
             self.add_log_entry('5. Required library not pre-compiled for this bitsandbytes release!')
-            self.add_log_entry('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.')
+            self.add_log_entry('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=118`.')
             self.add_log_entry('CUDA SETUP: The CUDA version for the compile might depend on your conda install. Inspect CUDA version via `conda list | grep cuda`.')
             self.add_log_entry('='*80)
             self.add_log_entry('')
@@ -268,7 +268,7 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
             "BNB_CUDA_VERSION=122 python ..."
             "OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122"
             "In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g."
-            "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2")
+            "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.2")
     CUDASetup.get_instance().add_log_entry(warning_msg, is_warning=True)
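The override flow this warning describes can be exercised directly. A minimal sketch, assuming CUDA 12.2 is installed under `/usr/local/cuda-12.2`; `train.py` is a stand-in for any entry point, not a script shipped with the library:

```bash
# Make the matching toolkit visible to the loader, then pin bitsandbytes
# to the CUDA 12.2 binary for a single run.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.2
BNB_CUDA_VERSION=122 python train.py  # train.py stands in for your own entry point
```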
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index f055e44f0..c6d1f27ca 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,6 +1,6 @@
 # Installation

-bitsandbytes is only supported on CUDA GPUs for CUDA versions **10.2 - 12.0**. Select your operating system below to see the installation instructions.
+bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**. Select your operating system below to see the installation instructions.

@@ -21,7 +21,7 @@ To install from PyPI.
 pip install bitsandbytes
 ```

-To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:

 ```bash
 apt-get install -y build-essential cmake
 ```
@@ -47,7 +47,7 @@ pip install .

 Windows systems require Visual Studio with C++ support as well as an installation of the CUDA SDK.

-You'll need to build bitsandbytes from source. To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA.
+You'll need to build bitsandbytes from source. To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA.

 ```bash
 git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
@@ -82,7 +82,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte
 ```bash
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
+# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123}
 # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True

 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
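Put together, the documented syntax corresponds to an invocation like the following; the install prefix `~/local` is just the example the docs themselves use:

```bash
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
bash install_cuda.sh 117 ~/local 1  # installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to ~/.bashrc
```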
"123": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run", } - def install_cuda(version, base_path, download_path): formatted_version = f"{version[:-1]}.{version[-1]}" folder = f"cuda-{formatted_version}" diff --git a/install_cuda.sh b/install_cuda.sh index 70263da15..8ffbc8478 100644 --- a/install_cuda.sh +++ b/install_cuda.sh @@ -1,7 +1,3 @@ -URL92=https://developer.nvidia.com/compute/cuda/9.2/Prod2/local_installers/cuda_9.2.148_396.37_linux -URL100=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux -URL101=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.105_418.39_linux.run -URL102=https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run URL110=https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run URL111=https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run URL112=https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run @@ -9,12 +5,12 @@ URL113=https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installer URL114=https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_linux.run URL115=https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run URL116=https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run -URL117=https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run +URL117=https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run URL118=https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run -URL120=https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run +URL120=https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda_12.0.1_525.85.12_linux.run URL121=https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run -URL122=https://developer.download.nvidia.com/compute/cuda/12.2.1/local_installers/cuda_12.2.1_535.86.10_linux.run -URL123=https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/cuda_12.3.1_545.23.08_linux.run +URL122=https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run +URL123=https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run CUDA_VERSION=$1 @@ -22,28 +18,16 @@ BASE_PATH=$2 EXPORT_BASHRC=$3 if [[ -n "$CUDA_VERSION" ]]; then - if [[ "$CUDA_VERSION" -eq "92" ]]; then - URL=$URL92 - FOLDER=cuda-9.2 - elif [[ "$CUDA_VERSION" -eq "100" ]]; then - URL=$URL100 - FOLDER=cuda-10.0 - elif [[ "$CUDA_VERSION" -eq "101" ]]; then - URL=$URL101 - FOLDER=cuda-10.1 - elif [[ "$CUDA_VERSION" -eq "102" ]]; then - URL=$URL102 - FOLDER=cuda-10.2 - elif [[ "$CUDA_VERSION" -eq "110" ]]; then + if [[ "$CUDA_VERSION" -eq "110" ]]; then URL=$URL110 FOLDER=cuda-11.0 - elif [[ "$CUDA_VERSION" -eq "111" ]]; then + elif [[ "$CUDA_VERSION" -eq "111" ]]; then URL=$URL111 FOLDER=cuda-11.1 - elif [[ "$CUDA_VERSION" -eq "112" ]]; then + elif [[ "$CUDA_VERSION" -eq "112" ]]; then URL=$URL112 FOLDER=cuda-11.2 - elif [[ 
"$CUDA_VERSION" -eq "113" ]]; then + elif [[ "$CUDA_VERSION" -eq "113" ]]; then URL=$URL113 FOLDER=cuda-11.3 elif [[ "$CUDA_VERSION" -eq "114" ]]; then From 1626374d318c1e5253bfeb8ec9ef80473a807d65 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 23 Feb 2024 09:44:25 +0000 Subject: [PATCH 02/13] upgrade pre-commit config --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index edcbc9b6b..c8ccfe8df 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,6 +18,6 @@ repos: args: - --fix=lf - repo: https://github.com/crate-ci/typos - rev: v1.17.2 + rev: v1.18.2 hooks: - id: typos From 5d6dfe6fb43e5aae277ec86cba20a002b34df705 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 23 Feb 2024 09:46:11 +0000 Subject: [PATCH 03/13] fix newly found typo due to upgraded typos pkg --- csrc/kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 6d15dbe64..f4673359b 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3075,7 +3075,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// 4. do dequantization from register of B into second pair of registers //// 5. store (4) into fragment //// 6. matmul aggregate into fragment C -//// 7. aggreecate files of C into shared memory block C +//// 7. aggregate files of C into shared memory block C //// 8. sum (7) //// 9. write outputs to matmul output matrix //} From 1f36bd4cf24d221e61cf2609b7c6170e955222bf Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Mon, 26 Feb 2024 16:12:46 +0100 Subject: [PATCH 04/13] docs: fix link text --- docs/source/integrations.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx index 0e37765c5..bcba6e5e5 100644 --- a/docs/source/integrations.mdx +++ b/docs/source/integrations.mdx @@ -2,7 +2,7 @@ With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with bitsandbytes primitives. -Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes). +Please review the [bitsandbytes section in the Transformers docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes). Details about the BitsAndBytesConfig can be found [here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig). @@ -21,7 +21,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dty # PEFT With `PEFT`, you can use QLoRA out of the box with `LoraConfig` and a 4-bit base model. -Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/peft/developer_guides/quantization#quantize-a-model). +Please review the [bitsandbytes section in the PEFT docs](https://huggingface.co/docs/peft/developer_guides/quantization#quantize-a-model). 
From a03df4325dfa8e25f9780d1b854870d85a972898 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Mon, 26 Feb 2024 13:42:23 -0600
Subject: [PATCH 05/13] Lit-GPT integration docs (#1089)

* lit-gpt integration

* mention PT lightning
---
 docs/source/integrations.mdx | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx
index bcba6e5e5..67d50d6a0 100644
--- a/docs/source/integrations.mdx
+++ b/docs/source/integrations.mdx
@@ -29,6 +29,25 @@ Bitsandbytes is also easily usable from within Accelerate.

 Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization).

+
+# PyTorch Lightning and Lightning Fabric
+
+Bitsandbytes is available from within both
+- [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/), a deep learning framework for professional AI researchers and machine learning engineers who need maximal flexibility without sacrificing performance at scale;
+- and [Lightning Fabric](https://lightning.ai/docs/fabric/stable/), a fast and lightweight way to scale PyTorch models without boilerplate.
+
+Please review the [bitsandbytes section in the PyTorch Lightning docs](https://lightning.ai/docs/pytorch/stable/common/precision_intermediate.html#quantization-via-bitsandbytes).
+
+
+# Lit-GPT
+
+Bitsandbytes is integrated into [Lit-GPT](https://github.com/Lightning-AI/lit-gpt), a hackable implementation of state-of-the-art open-source large language models, based on Lightning Fabric, where it can be used for quantization during training, finetuning, and inference.
+
+Please review the [bitsandbytes section in the Lit-GPT quantization docs](https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md).
+
+
 # Trainer for the optimizers

 You can use any of the 8-bit and/or paged optimizers by simply passing them to the `transformers.Trainer` class on initialization. All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`).
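The Trainer integration in the context above boils down to a single string. A minimal sketch, assuming a standard `transformers` setup; `output_dir` is an arbitrary placeholder:

```py
from transformers import TrainingArguments

# Selecting a bnb paged/8-bit optimizer is a one-line change:
training_args = TrainingArguments(
    output_dir="out",
    optim="paged_adamw_32bit",  # any supported bnb optimizer string works here
)
```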
From 433275e3791122a21900474ae0eac8150ec344d2 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 27 Feb 2024 10:39:04 +0100
Subject: [PATCH 06/13] improve accelerate reference in docs (#1086)

* improve accelerate reference in docs

* Apply suggestions from code review

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* fix spelling

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
---
 docs/source/integrations.mdx | 41 +++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx
index 67d50d6a0..48b4d6060 100644
--- a/docs/source/integrations.mdx
+++ b/docs/source/integrations.mdx
@@ -1,8 +1,8 @@
 # Transformers

-With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with bitsandbytes primitives.
+With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with `bitsandbytes` primitives.

-Please review the [bitsandbytes section in the Transformers docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes).
+Please review the [`bitsandbytes` section in the Transformers docs](https://huggingface.co/docs/transformers/main/en/quantization#bitsandbytes).

 Details about the BitsAndBytesConfig can be found [here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig).

@@ -25,9 +25,38 @@ Please review the [bitsandbytes section in the PEFT docs](https://huggingface.co

 # Accelerate

-Bitsandbytes is also easily usable from within Accelerate.
+Bitsandbytes is also easily usable from within Accelerate, where you can quantize any PyTorch model simply by passing a quantization config; e.g.:

-Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization).
+```py
+import torch
+from accelerate import init_empty_weights
+from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
+from mingpt.model import GPT
+
+model_config = GPT.get_default_config()
+model_config.model_type = 'gpt2-xl'
+model_config.vocab_size = 50257
+model_config.block_size = 1024
+
+with init_empty_weights():
+    empty_model = GPT(model_config)
+
+bnb_quantization_config = BnbQuantizationConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,  # optional
+    bnb_4bit_use_double_quant=True,         # optional
+    bnb_4bit_quant_type="nf4"               # optional
+)
+
+quantized_model = load_and_quantize_model(
+    empty_model,
+    weights_location=weights_location,
+    bnb_quantization_config=bnb_quantization_config,
+    device_map="auto"
+)
+```
+
+For further details, e.g. model saving, cpu-offloading and fine-tuning, please review the [`bitsandbytes` section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization).
@@ -59,5 +88,5 @@ e.g. for transformers state that you can load any model in 8-bit / 4-bit precisi

 # Blog posts

-- [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)
-- [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration)
+- [Making LLMs even more accessible with `bitsandbytes`, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)
+- [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and `bitsandbytes`](https://huggingface.co/blog/hf-bitsandbytes-integration)
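For comparison with the Accelerate flow above, the Transformers path mentioned at the top of that file needs only a quantization config at load time. A minimal sketch; the model id is illustrative:

```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 8-bit on-the-fly quantization at load time.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
```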
From 753df25c7fed6683b7d4562319849192ec8d9873 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 27 Feb 2024 12:24:20 -0500
Subject: [PATCH 07/13] (cmake) Fix cuda arch selection (#1091)

* (cmake) Fix generation of targets for nvcc

* Typo

* (ci) linux + CUDA workflow: make sure we specify target architectures

* fix

* fix one more time

* (cmake) Default in CMAKE_CUDA_ARCHITECTURES_ALL when cmake<3.23, make sure we build only selected cubins and only ptx for latest capability

* Fix static lookup for CMAKE_CUDA_ARCHITECTURES_ALL on cmake<3.23

* Remove debug setting

* clarification
---
 .github/workflows/python-package.yml |  2 +-
 CMakeLists.txt                       | 44 ++++++++++++++++++++++++----
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index e48c25cc5..faa30ca30 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -125,7 +125,7 @@ jobs:
         docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
           "apt-get update \
           && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-          && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \
+          && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"50;52;60;61;70;75;80;86;89;90\" -DNO_CUBLASLT=${NO_CUBLASLT} . \
           && cmake --build ."
       else
         cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b9f1854b..7f70a089e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,7 +33,7 @@ endif()

 set(BNB_OUTPUT_NAME "bitsandbytes")

-message(STATUS "Building with backend ${COMPUTE_BACKEND}")
+message(STATUS "Configuring ${PROJECT_NAME} (Backend: ${COMPUTE_BACKEND})")

 if(${COMPUTE_BACKEND} STREQUAL "cuda")
     if(APPLE)
@@ -82,6 +82,31 @@ if(BUILD_CUDA)
         message(FATAL_ERROR "CUDA Version > 12 is not supported")
     endif()

+    # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
+    if(CMAKE_VERSION VERSION_LESS "3.23.0")
+        message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
+
+        # 11.x and 12.x both support these at a minimum.
+        set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80)
+        set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
+
+        # CUDA 11.1 adds Ampere support for GA102-GA107.
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.1")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 86)
+        endif()
+
+        # CUDA 11.4 adds Ampere support for GA10B.
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.4")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 87)
+        endif()
+
+        # CUDA 11.8 adds support for Ada and Hopper.
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
+        endif()
+    endif()
+
     string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")
     if(PTXAS_VERBOSE)
         # Verbose? Outputs register usage information, and other things...
@@ -103,10 +128,18 @@ if(BUILD_CUDA)
     message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}")
     message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}")

-    foreach(capability ${COMPUTE_CAPABILITY})
-        string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}")
-    endforeach()
-
+    # Use the "real" option to build native cubin for all selections.
+    # Ensure we build the PTX for the latest version.
+    # This behavior of adding a PTX (virtual) target for the highest architecture
+    # is similar to how the "all" and "all-major" options would behave in CMake >= 3.23.
+    # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default
+    list(REMOVE_DUPLICATES COMPUTE_CAPABILITY)
+    list(SORT COMPUTE_CAPABILITY COMPARE NATURAL)
+    list(POP_BACK COMPUTE_CAPABILITY _LATEST_CAPABILITY)
+    list(TRANSFORM COMPUTE_CAPABILITY APPEND "-real" OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES)
+    list(APPEND CMAKE_CUDA_ARCHITECTURES ${_LATEST_CAPABILITY})
+
+    message(STATUS "CUDA Targets: ${CMAKE_CUDA_ARCHITECTURES}")
     message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}")

     list(APPEND SRC_FILES ${CUDA_FILES})
@@ -149,7 +182,6 @@ endif()
 # Weird MSVC hacks
 if(MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast")
 endif()

 set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
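To reproduce the target selection above outside CI, a configure step along these lines should work; the capability list is an arbitrary example subset. Per the new logic, each selected capability becomes a `-real` (cubin) target, and the newest one in the list additionally carries PTX:

```bash
# Real cubins for 7.5/8.0/8.6, plus PTX for the newest selected capability (8.6 here).
cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="75;80;86" .
cmake --build .
```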
From cc5f8cd8b9f6f97f30b85322780359851ee2caf1 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 27 Feb 2024 13:03:42 -0500
Subject: [PATCH 08/13] (cmake) Update library output directory (#1080)

---
 CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f70a089e..62ff4e535 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -214,10 +214,10 @@ if(WIN32)
 endif()
 set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME})
 if(MSVC)
-    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes)
-    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes)
-    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes)
-    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes)
+    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
+    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
+    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
+    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
 endif()

 set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes)
From 4b232edf8c923dab4a0059a449cd3dab3201e4d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Rodr=C3=ADguez=20Salamanca?=
Date: Tue, 27 Feb 2024 19:46:20 +0100
Subject: [PATCH 09/13] Fix example int8_inference_huggingface.py (#414)

* Fix example int8_inference_huggingface.py

* Update examples/int8_inference_huggingface.py

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
---
 examples/int8_inference_huggingface.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py
index 2cee48e8e..c89ba8d11 100644
--- a/examples/int8_inference_huggingface.py
+++ b/examples/int8_inference_huggingface.py
@@ -1,24 +1,24 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import LlamaForCausalLM, LlamaTokenizer

 MAX_NEW_TOKENS = 128
-model_name = 'decapoda-research/llama-7b-hf'
+model_name = 'meta-llama/Llama-2-7b-hf'

 text = 'Hamburg is in which country?\n'
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer = LlamaTokenizer.from_pretrained(model_name)
 input_ids = tokenizer(text, return_tensors="pt").input_ids

-free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3)
 max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'

 n_gpus = torch.cuda.device_count()
 max_memory = {i: max_memory for i in range(n_gpus)}

-model = AutoModelForCausalLM.from_pretrained(
+model = LlamaForCausalLM.from_pretrained(
   model_name,
   device_map='auto',
   load_in_8bit=True,
   max_memory=max_memory
 )
+
 generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
 print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

From 1d709aadef94c35ff8a403ab1a65f7343c011074 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Tue, 27 Feb 2024 20:15:02 +0100
Subject: [PATCH 10/13] Add concurrency to not waste precious build minutes
 when modifying PRs frequently. (#1051)

Co-authored-by: wkpark
---
 .github/workflows/python-package.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index faa30ca30..c85cd063d 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -15,10 +15,13 @@ on:
       - 'setup.py'
       - 'pyproject.toml'
       - 'pytest.ini'
-      - '**/*.md'
   release:
     types: [ published ]

+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:

From 0488566462c24f5016ca76f698ef9d4d95d00b11 Mon Sep 17 00:00:00 2001
From: Won-Kyu Park
Date: Wed, 28 Feb 2024 04:22:37 +0900
Subject: [PATCH 11/13] fix cudart*dll for Windows (#1064)

---
 bitsandbytes/cuda_setup/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index cd0d94cd7..b351f7f03 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -30,7 +30,7 @@ DYNAMIC_LIBRARY_SUFFIX = { "Darwin": ".dylib", "Windows": ".dll", "Linux": ".so"}.get(platform.system(), ".so")

 if platform.system() == "Windows":  # Windows
-    CUDA_RUNTIME_LIBS = ["nvcuda.dll"]
+    CUDA_RUNTIME_LIBS = ["cudart64_110.dll", "cudart64_12.dll"]
 else:  # Linux or other
     # these are the most common libs names
     # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
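A small diagnostic, not part of the library, can confirm which of the newly listed runtime DLLs resolves on a given Windows machine; a minimal sketch using only the standard library:

```py
import ctypes
import platform

# Check which CUDA runtime DLLs from the updated lookup list are loadable.
if platform.system() == "Windows":
    for lib in ["cudart64_110.dll", "cudart64_12.dll"]:
        try:
            ctypes.WinDLL(lib)
            print(f"found {lib}")
        except OSError:
            print(f"missing {lib}")
```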
From 20f3eea787f577a5c11ad75f6be83b94c2a882ff Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Wed, 28 Feb 2024 15:48:54 +0100
Subject: [PATCH 12/13] docs: add header for compilation from source

---
 docs/source/installation.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index c6d1f27ca..f701f08d0 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -21,6 +21,8 @@ To install from PyPI.
 pip install bitsandbytes
 ```

+## Alternative: Compiling from source
+
 To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:

 ```bash

From f9eba9c8dd3ffc7d59036fbd16c2b0c498fd3041 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 28 Feb 2024 09:52:19 -0500
Subject: [PATCH 13/13] (ci) update apt repo before aarch64 build tools are
 installed (#1096)

---
 .github/workflows/python-package.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index c85cd063d..a25f53f46 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -56,7 +56,8 @@ jobs:
         build_os=${{ matrix.os }}
         build_arch=${{ matrix.arch }}
         if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then
-          # Allow cross-compile om aarch64
+          # Allow cross-compile on aarch64
+          sudo apt-get update
           sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu g++-aarch64-linux-gnu
           cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ -DCOMPUTE_BACKEND=cpu .
         elif [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then