From e820409c095ea7cbb5ce156992307b84352cbf90 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Thu, 22 Feb 2024 12:16:40 -0500
Subject: [PATCH 01/13] (docs) Clarify Python and CUDA Toolkit version
 requirement (#1076)

(misc) Update CUDA download URLs
---
 bitsandbytes/cuda_setup/main.py |  4 ++--
 docs/source/installation.mdx    |  8 ++++----
 install_cuda.py                 | 15 +++++----------
 install_cuda.sh                 | 32 ++++++++------------------------
 4 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index 14c7abbd8..cd0d94cd7 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -161,7 +161,7 @@ def run_cuda_setup(self):
             self.add_log_entry('3. CUDA not installed')
             self.add_log_entry('4. You have multiple conflicting CUDA libraries')
             self.add_log_entry('5. Required library not pre-compiled for this bitsandbytes release!')
-            self.add_log_entry('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=113`.')
+            self.add_log_entry('CUDA SETUP: If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION` for example, `make CUDA_VERSION=118`.')
             self.add_log_entry('CUDA SETUP: The CUDA version for the compile might depend on your conda install. Inspect CUDA version via `conda list | grep cuda`.')
             self.add_log_entry('='*80)
             self.add_log_entry('')
@@ -268,7 +268,7 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
             "BNB_CUDA_VERSION=122 python ..."
             "OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122"
             "In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g."
-            "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2")
+            "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.2")
     CUDASetup.get_instance().add_log_entry(warning_msg, is_warning=True)
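The override flow this warning describes can be exercised directly. A minimal sketch, assuming CUDA 12.2 is installed under `/usr/local/cuda-12.2`; `train.py` is a stand-in for any entry point, not a script shipped with the library:

```bash
# Make the matching toolkit visible to the loader, then pin bitsandbytes
# to the CUDA 12.2 binary for a single run.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.2
BNB_CUDA_VERSION=122 python train.py  # train.py stands in for your own entry point
```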
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index f055e44f0..c6d1f27ca 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -1,6 +1,6 @@
 # Installation

-bitsandbytes is only supported on CUDA GPUs for CUDA versions **10.2 - 12.0**. Select your operating system below to see the installation instructions.
+bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**. Select your operating system below to see the installation instructions.

@@ -21,7 +21,7 @@ To install from PyPI.
 pip install bitsandbytes
 ```

-To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:
+To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:

 ```bash
 apt-get install -y build-essential cmake
 ```
@@ -47,7 +47,7 @@ pip install .

 Windows systems require Visual Studio with C++ support as well as an installation of the CUDA SDK.

-You'll need to build bitsandbytes from source. To compile from source, you need CMake >= **3.22.1** and Python >= **3.10** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA.
+You'll need to build bitsandbytes from source. To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA.

 ```bash
 git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/
@@ -82,7 +82,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte
 ```bash
 wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
 # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
-# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122}
+# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123}
 # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True

 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
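Put together, the documented syntax corresponds to an invocation like the following; the install prefix `~/local` is just the example the docs themselves use:

```bash
wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh
# cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH
bash install_cuda.sh 117 ~/local 1  # installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to ~/.bashrc
```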
"123": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run", } - def install_cuda(version, base_path, download_path): formatted_version = f"{version[:-1]}.{version[-1]}" folder = f"cuda-{formatted_version}" diff --git a/install_cuda.sh b/install_cuda.sh index 70263da15..8ffbc8478 100644 --- a/install_cuda.sh +++ b/install_cuda.sh @@ -1,7 +1,3 @@ -URL92=https://developer.nvidia.com/compute/cuda/9.2/Prod2/local_installers/cuda_9.2.148_396.37_linux -URL100=https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux -URL101=https://developer.nvidia.com/compute/cuda/10.1/Prod/local_installers/cuda_10.1.105_418.39_linux.run -URL102=https://developer.download.nvidia.com/compute/cuda/10.2/Prod/local_installers/cuda_10.2.89_440.33.01_linux.run URL110=https://developer.download.nvidia.com/compute/cuda/11.0.3/local_installers/cuda_11.0.3_450.51.06_linux.run URL111=https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run URL112=https://developer.download.nvidia.com/compute/cuda/11.2.2/local_installers/cuda_11.2.2_460.32.03_linux.run @@ -9,12 +5,12 @@ URL113=https://developer.download.nvidia.com/compute/cuda/11.3.1/local_installer URL114=https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_linux.run URL115=https://developer.download.nvidia.com/compute/cuda/11.5.2/local_installers/cuda_11.5.2_495.29.05_linux.run URL116=https://developer.download.nvidia.com/compute/cuda/11.6.2/local_installers/cuda_11.6.2_510.47.03_linux.run -URL117=https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run +URL117=https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_linux.run URL118=https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run -URL120=https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run +URL120=https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installers/cuda_12.0.1_525.85.12_linux.run URL121=https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run -URL122=https://developer.download.nvidia.com/compute/cuda/12.2.1/local_installers/cuda_12.2.1_535.86.10_linux.run -URL123=https://developer.download.nvidia.com/compute/cuda/12.3.1/local_installers/cuda_12.3.1_545.23.08_linux.run +URL122=https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run +URL123=https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run CUDA_VERSION=$1 @@ -22,28 +18,16 @@ BASE_PATH=$2 EXPORT_BASHRC=$3 if [[ -n "$CUDA_VERSION" ]]; then - if [[ "$CUDA_VERSION" -eq "92" ]]; then - URL=$URL92 - FOLDER=cuda-9.2 - elif [[ "$CUDA_VERSION" -eq "100" ]]; then - URL=$URL100 - FOLDER=cuda-10.0 - elif [[ "$CUDA_VERSION" -eq "101" ]]; then - URL=$URL101 - FOLDER=cuda-10.1 - elif [[ "$CUDA_VERSION" -eq "102" ]]; then - URL=$URL102 - FOLDER=cuda-10.2 - elif [[ "$CUDA_VERSION" -eq "110" ]]; then + if [[ "$CUDA_VERSION" -eq "110" ]]; then URL=$URL110 FOLDER=cuda-11.0 - elif [[ "$CUDA_VERSION" -eq "111" ]]; then + elif [[ "$CUDA_VERSION" -eq "111" ]]; then URL=$URL111 FOLDER=cuda-11.1 - elif [[ "$CUDA_VERSION" -eq "112" ]]; then + elif [[ "$CUDA_VERSION" -eq "112" ]]; then URL=$URL112 FOLDER=cuda-11.2 - elif [[ 
"$CUDA_VERSION" -eq "113" ]]; then + elif [[ "$CUDA_VERSION" -eq "113" ]]; then URL=$URL113 FOLDER=cuda-11.3 elif [[ "$CUDA_VERSION" -eq "114" ]]; then From 1626374d318c1e5253bfeb8ec9ef80473a807d65 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 23 Feb 2024 09:44:25 +0000 Subject: [PATCH 02/13] upgrade pre-commit config --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index edcbc9b6b..c8ccfe8df 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,6 +18,6 @@ repos: args: - --fix=lf - repo: https://github.com/crate-ci/typos - rev: v1.17.2 + rev: v1.18.2 hooks: - id: typos From 5d6dfe6fb43e5aae277ec86cba20a002b34df705 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 23 Feb 2024 09:46:11 +0000 Subject: [PATCH 03/13] fix newly found typo due to upgraded typos pkg --- csrc/kernels.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 6d15dbe64..f4673359b 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3075,7 +3075,7 @@ template __global__ void kExtractOutliers(char *A, int *idx, char * //// 4. do dequantization from register of B into second pair of registers //// 5. store (4) into fragment //// 6. matmul aggregate into fragment C -//// 7. aggreecate files of C into shared memory block C +//// 7. aggregate files of C into shared memory block C //// 8. sum (7) //// 9. write outputs to matmul output matrix //} From 1f36bd4cf24d221e61cf2609b7c6170e955222bf Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Mon, 26 Feb 2024 16:12:46 +0100 Subject: [PATCH 04/13] docs: fix link text --- docs/source/integrations.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx index 0e37765c5..bcba6e5e5 100644 --- a/docs/source/integrations.mdx +++ b/docs/source/integrations.mdx @@ -2,7 +2,7 @@ With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with bitsandbytes primitives. -Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes). +Please review the [bitsandbytes section in the Transformers docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes). Details about the BitsAndBytesConfig can be found [here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig). @@ -21,7 +21,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dty # PEFT With `PEFT`, you can use QLoRA out of the box with `LoraConfig` and a 4-bit base model. -Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/peft/developer_guides/quantization#quantize-a-model). +Please review the [bitsandbytes section in the PEFT docs](https://huggingface.co/docs/peft/developer_guides/quantization#quantize-a-model). 
From a03df4325dfa8e25f9780d1b854870d85a972898 Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Mon, 26 Feb 2024 13:42:23 -0600
Subject: [PATCH 05/13] Lit-GPT integration docs (#1089)

* lit-gpt integration

* mention PT lightning
---
 docs/source/integrations.mdx | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx
index bcba6e5e5..67d50d6a0 100644
--- a/docs/source/integrations.mdx
+++ b/docs/source/integrations.mdx
@@ -29,6 +29,25 @@ Bitsandbytes is also easily usable from within Accelerate.

 Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization).

+
+# PyTorch Lightning and Lightning Fabric
+
+Bitsandbytes is available from within both
+- [PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/), a deep learning framework for professional AI researchers and machine learning engineers who need maximal flexibility without sacrificing performance at scale;
+- and [Lightning Fabric](https://lightning.ai/docs/fabric/stable/), a fast and lightweight way to scale PyTorch models without boilerplate.
+
+Please review the [bitsandbytes section in the PyTorch Lightning docs](https://lightning.ai/docs/pytorch/stable/common/precision_intermediate.html#quantization-via-bitsandbytes).
+
+
+# Lit-GPT
+
+Bitsandbytes is integrated into [Lit-GPT](https://github.com/Lightning-AI/lit-gpt), a hackable implementation of state-of-the-art open-source large language models, based on Lightning Fabric, where it can be used for quantization during training, finetuning, and inference.
+
+Please review the [bitsandbytes section in the Lit-GPT quantization docs](https://github.com/Lightning-AI/lit-gpt/blob/main/tutorials/quantize.md).
+
+
 # Trainer for the optimizers

 You can use any of the 8-bit and/or paged optimizers by simply passing them to the `transformers.Trainer` class on initialization. All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`).
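The Trainer integration in the context above boils down to a single string. A minimal sketch, assuming a standard `transformers` setup; `output_dir` is an arbitrary placeholder:

```py
from transformers import TrainingArguments

# Selecting a bnb paged/8-bit optimizer is a one-line change:
training_args = TrainingArguments(
    output_dir="out",
    optim="paged_adamw_32bit",  # any supported bnb optimizer string works here
)
```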
From 433275e3791122a21900474ae0eac8150ec344d2 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Tue, 27 Feb 2024 10:39:04 +0100
Subject: [PATCH 06/13] improve accelerate reference in docs (#1086)

* improve accelerate reference in docs

* Apply suggestions from code review

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* fix spelling

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
---
 docs/source/integrations.mdx | 41 +++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx
index 67d50d6a0..48b4d6060 100644
--- a/docs/source/integrations.mdx
+++ b/docs/source/integrations.mdx
@@ -1,8 +1,8 @@
 # Transformers

-With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with bitsandbytes primitives.
+With Transformers it's very easy to load any model in 4 or 8-bit, quantizing them on the fly with `bitsandbytes` primitives.

-Please review the [bitsandbytes section in the Transformers docs](https://huggingface.co/docs/transformers/v4.37.2/en/quantization#bitsandbytes).
+Please review the [`bitsandbytes` section in the Transformers docs](https://huggingface.co/docs/transformers/main/en/quantization#bitsandbytes).

 Details about the BitsAndBytesConfig can be found [here](https://huggingface.co/docs/transformers/v4.37.2/en/main_classes/quantization#transformers.BitsAndBytesConfig).

@@ -25,9 +25,38 @@ Please review the [bitsandbytes section in the PEFT docs](https://huggingface.co

 # Accelerate

-Bitsandbytes is also easily usable from within Accelerate.
+Bitsandbytes is also easily usable from within Accelerate, where you can quantize any PyTorch model simply by passing a quantization config; e.g.:

-Please review the [bitsandbytes section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization).
+```py
+import torch
+from accelerate import init_empty_weights
+from accelerate.utils import BnbQuantizationConfig, load_and_quantize_model
+from mingpt.model import GPT
+
+model_config = GPT.get_default_config()
+model_config.model_type = 'gpt2-xl'
+model_config.vocab_size = 50257
+model_config.block_size = 1024
+
+with init_empty_weights():
+    empty_model = GPT(model_config)
+
+bnb_quantization_config = BnbQuantizationConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,  # optional
+    bnb_4bit_use_double_quant=True,         # optional
+    bnb_4bit_quant_type="nf4"               # optional
+)
+
+quantized_model = load_and_quantize_model(
+    empty_model,
+    weights_location=weights_location,
+    bnb_quantization_config=bnb_quantization_config,
+    device_map="auto"
+)
+```
+
+For further details, e.g. model saving, cpu-offloading and fine-tuning, please review the [`bitsandbytes` section in the Accelerate docs](https://huggingface.co/docs/accelerate/en/usage_guides/quantization).
@@ -59,5 +88,5 @@ e.g. for transformers state that you can load any model in 8-bit / 4-bit precisi

 # Blog posts

-- [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)
-- [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration)
+- [Making LLMs even more accessible with `bitsandbytes`, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)
+- [A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and `bitsandbytes`](https://huggingface.co/blog/hf-bitsandbytes-integration)
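For comparison with the Accelerate flow above, the Transformers path mentioned at the top of that file needs only a quantization config at load time. A minimal sketch; the model id is illustrative:

```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 8-bit on-the-fly quantization at load time.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)
```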
From 753df25c7fed6683b7d4562319849192ec8d9873 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 27 Feb 2024 12:24:20 -0500
Subject: [PATCH 07/13] (cmake) Fix cuda arch selection (#1091)

* (cmake) Fix generation of targets for nvcc

* Typo

* (ci) linux + CUDA workflow: make sure we specify target architectures

* fix

* fix one more time

* (cmake) Default in CMAKE_CUDA_ARCHITECTURES_ALL when cmake<3.23, make sure we build only selected cubins and only ptx for latest capability

* Fix static lookup for CMAKE_CUDA_ARCHITECTURES_ALL on cmake<3.23

* Remove debug setting

* clarification
---
 .github/workflows/python-package.yml |  2 +-
 CMakeLists.txt                       | 44 ++++++++++++++++++++++++----
 2 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index e48c25cc5..faa30ca30 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -125,7 +125,7 @@ jobs:
         docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
           "apt-get update \
           && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
-          && cmake -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} . \
+          && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"50;52;60;61;70;75;80;86;89;90\" -DNO_CUBLASLT=${NO_CUBLASLT} . \
           && cmake --build ."
       else
         cmake -G Ninja -DCOMPUTE_BACKEND=cuda -DNO_CUBLASLT=${NO_CUBLASLT} -DCMAKE_BUILD_TYPE=Release -S .
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b9f1854b..7f70a089e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,7 +33,7 @@ endif()

 set(BNB_OUTPUT_NAME "bitsandbytes")

-message(STATUS "Building with backend ${COMPUTE_BACKEND}")
+message(STATUS "Configuring ${PROJECT_NAME} (Backend: ${COMPUTE_BACKEND})")

 if(${COMPUTE_BACKEND} STREQUAL "cuda")
     if(APPLE)
@@ -82,6 +82,31 @@ if(BUILD_CUDA)
         message(FATAL_ERROR "CUDA Version > 12 is not supported")
     endif()

+    # CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
+    if(CMAKE_VERSION VERSION_LESS "3.23.0")
+        message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")
+
+        # 11.x and 12.x both support these at a minimum.
+        set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80)
+        set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)
+
+        # CUDA 11.1 adds Ampere support for GA102-GA107.
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.1")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 86)
+        endif()
+
+        # CUDA 11.4 adds Ampere support for GA10B.
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.4")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 87)
+        endif()
+
+        # CUDA 11.8 adds support for Ada and Hopper.
+        if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
+            list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
+        endif()
+    endif()
+
     string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")
     if(PTXAS_VERBOSE)
         # Verbose? Outputs register usage information, and other things...
@@ -103,10 +128,18 @@ if(BUILD_CUDA)
     message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}")
     message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}")

-    foreach(capability ${COMPUTE_CAPABILITY})
-        string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${capability},code=sm_${capability}")
-    endforeach()
-
+    # Use the "real" option to build native cubin for all selections.
+    # Ensure we build the PTX for the latest version.
+    # This behavior of adding a PTX (virtual) target for the highest architecture
+    # is similar to how the "all" and "all-major" options would behave in CMake >= 3.23.
+    # TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default
+    list(REMOVE_DUPLICATES COMPUTE_CAPABILITY)
+    list(SORT COMPUTE_CAPABILITY COMPARE NATURAL)
+    list(POP_BACK COMPUTE_CAPABILITY _LATEST_CAPABILITY)
+    list(TRANSFORM COMPUTE_CAPABILITY APPEND "-real" OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES)
+    list(APPEND CMAKE_CUDA_ARCHITECTURES ${_LATEST_CAPABILITY})
+
+    message(STATUS "CUDA Targets: ${CMAKE_CUDA_ARCHITECTURES}")
     message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}")

     list(APPEND SRC_FILES ${CUDA_FILES})
@@ -149,7 +182,6 @@ endif()
 # Weird MSVC hacks
 if(MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2 /fp:fast")
 endif()

 set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
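To reproduce the target selection above outside CI, a configure step along these lines should work; the capability list is an arbitrary example subset. Per the new logic, each selected capability becomes a `-real` (cubin) target, and the newest one in the list additionally carries PTX:

```bash
# Real cubins for 7.5/8.0/8.6, plus PTX for the newest selected capability (8.6 here).
cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY="75;80;86" .
cmake --build .
```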
From cc5f8cd8b9f6f97f30b85322780359851ee2caf1 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Tue, 27 Feb 2024 13:03:42 -0500
Subject: [PATCH 08/13] (cmake) Update library output directory (#1080)

---
 CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f70a089e..62ff4e535 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -214,10 +214,10 @@ if(WIN32)
 endif()
 set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME})
 if(MSVC)
-    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes)
-    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes)
-    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes)
-    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes)
+    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
+    set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
+    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
+    set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
 endif()

 set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes)
From 4b232edf8c923dab4a0059a449cd3dab3201e4d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Rodr=C3=ADguez=20Salamanca?=
Date: Tue, 27 Feb 2024 19:46:20 +0100
Subject: [PATCH 09/13] Fix example int8_inference_huggingface.py (#414)

* Fix example int8_inference_huggingface.py

* Update examples/int8_inference_huggingface.py

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
---
 examples/int8_inference_huggingface.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/int8_inference_huggingface.py b/examples/int8_inference_huggingface.py
index 2cee48e8e..c89ba8d11 100644
--- a/examples/int8_inference_huggingface.py
+++ b/examples/int8_inference_huggingface.py
@@ -1,24 +1,24 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import LlamaForCausalLM, LlamaTokenizer

 MAX_NEW_TOKENS = 128
-model_name = 'decapoda-research/llama-7b-hf'
+model_name = 'meta-llama/Llama-2-7b-hf'

 text = 'Hamburg is in which country?\n'
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer = LlamaTokenizer.from_pretrained(model_name)
 input_ids = tokenizer(text, return_tensors="pt").input_ids

-free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3)
 max_memory = f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB'

 n_gpus = torch.cuda.device_count()
 max_memory = {i: max_memory for i in range(n_gpus)}

-model = AutoModelForCausalLM.from_pretrained(
+model = LlamaForCausalLM.from_pretrained(
   model_name,
   device_map='auto',
   load_in_8bit=True,
   max_memory=max_memory
 )
+
 generated_ids = model.generate(input_ids, max_length=MAX_NEW_TOKENS)
 print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

From 1d709aadef94c35ff8a403ab1a65f7343c011074 Mon Sep 17 00:00:00 2001
From: Rickard
Date: Tue, 27 Feb 2024 20:15:02 +0100
Subject: [PATCH 10/13] Add concurrency to not waste precious build minutes
 when modifying PRs frequently. (#1051)

Co-authored-by: wkpark
---
 .github/workflows/python-package.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index faa30ca30..c85cd063d 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -15,10 +15,13 @@ on:
       - 'setup.py'
       - 'pyproject.toml'
       - 'pytest.ini'
-      - '**/*.md'
   release:
     types: [ published ]

+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
 jobs:

From 0488566462c24f5016ca76f698ef9d4d95d00b11 Mon Sep 17 00:00:00 2001
From: Won-Kyu Park
Date: Wed, 28 Feb 2024 04:22:37 +0900
Subject: [PATCH 11/13] fix cudart*dll for Windows (#1064)

---
 bitsandbytes/cuda_setup/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index cd0d94cd7..b351f7f03 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -30,7 +30,7 @@ DYNAMIC_LIBRARY_SUFFIX = { "Darwin": ".dylib", "Windows": ".dll", "Linux": ".so"}.get(platform.system(), ".so")

 if platform.system() == "Windows":  # Windows
-    CUDA_RUNTIME_LIBS = ["nvcuda.dll"]
+    CUDA_RUNTIME_LIBS = ["cudart64_110.dll", "cudart64_12.dll"]
 else:  # Linux or other
     # these are the most common libs names
     # libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead
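A small diagnostic, not part of the library, can confirm which of the newly listed runtime DLLs resolves on a given Windows machine; a minimal sketch using only the standard library:

```py
import ctypes
import platform

# Check which CUDA runtime DLLs from the updated lookup list are loadable.
if platform.system() == "Windows":
    for lib in ["cudart64_110.dll", "cudart64_12.dll"]:
        try:
            ctypes.WinDLL(lib)
            print(f"found {lib}")
        except OSError:
            print(f"missing {lib}")
```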
From 20f3eea787f577a5c11ad75f6be83b94c2a882ff Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Wed, 28 Feb 2024 15:48:54 +0100
Subject: [PATCH 12/13] docs: add header for compilation from source

---
 docs/source/installation.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index c6d1f27ca..f701f08d0 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -21,6 +21,8 @@ To install from PyPI.
 pip install bitsandbytes
 ```

+## Alternative: Compiling from source
+
 To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu:

 ```bash

From f9eba9c8dd3ffc7d59036fbd16c2b0c498fd3041 Mon Sep 17 00:00:00 2001
From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com>
Date: Wed, 28 Feb 2024 09:52:19 -0500
Subject: [PATCH 13/13] (ci) update apt repo before aarch64 build tools are
 installed (#1096)

---
 .github/workflows/python-package.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index c85cd063d..a25f53f46 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -56,7 +56,8 @@ jobs:
         build_os=${{ matrix.os }}
         build_arch=${{ matrix.arch }}
         if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then
-          # Allow cross-compile om aarch64
+          # Allow cross-compile on aarch64
+          sudo apt-get update
           sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu g++-aarch64-linux-gnu
           cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ -DCOMPUTE_BACKEND=cpu .
         elif [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then