Enable crate-ci/typos lint; fix typos (#1005)

Co-authored-by: Titus von Koeller <[email protected]> fix erroneous correction
bitsandbytes-foundation · Feb 5, 2024 · 8c507d9 · 8c507d9
1 parent 8a14c63
commit 8c507d9
Show file tree

Hide file tree

Showing 14 changed files with 68 additions and 46 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -17,3 +17,7 @@ repos:
       - id: mixed-line-ending
         args:
           - --fix=lf
+  - repo: https://github.com/crate-ci/typos
+    rev: v1.17.2
+    hooks:
+      - id: typos
diff --git a/_typos.toml b/_typos.toml
@@ -0,0 +1,11 @@
+[files]
+
+[default.extend-identifiers]
+
+[type.py.extend-words]
+"BA" = "BA"  # used as a commented-out variable in tests
+
+[type.cuda.extend-words]
+"subtile" = "subtile"
+"subtiles" = "subtiles"
+"transation" = "transation"  # TODO: is this transition, transaction, translation..?
diff --git a/benchmarking/switchback/make_plot_with_jsonl.py b/benchmarking/switchback/make_plot_with_jsonl.py
@@ -36,8 +36,8 @@
 
         ('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'),
         ('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'),
-        ('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'),
-        ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize gloabl and\ntranspose W (switchback)'),
+        ('w_quantize_global', '.', '--', 'C4', 'Quantize global W (switchback)'),
+        ('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize global and\ntranspose W (switchback)'),
     ]:
         xs = []
         ys = []

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
@@ -4,7 +4,7 @@
     [ ] TODO: Q - What if we have multiple GPUs of different makes?
 - CUDA version
 - Software:
-    - CPU-only: only CPU quantization functions (no optimizer, no matrix multipl)
+    - CPU-only: only CPU quantization functions (no optimizer, no matrix multiply)
     - CuBLAS-LT: full-build 8-bit optimizer
     - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
 
@@ -263,7 +263,7 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
         warning_msg = (
             f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. "
             "We select the PyTorch default libcudart.so, which is {torch.version.cuda},"
-            "but this might missmatch with the CUDA version that is needed for bitsandbytes."
+            "but this might mismatch with the CUDA version that is needed for bitsandbytes."
             "To override this behavior set the BNB_CUDA_VERSION=<version string, e.g. 122> environmental variable"
             "For example, if you want to use the CUDA version 122"
             "BNB_CUDA_VERSION=122 python ..."

diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py
@@ -120,7 +120,7 @@ def get_instance(cls):
         return cls._instance
 
     def prefetch_all(self, to_cpu=False):
-        # assume the first added, will be hte
+        # assume the first added, will be the
         # ones that are used first, so swap them in last
         # in the case they are evicted again
         for t in self.paged_tensors[::-1]:
@@ -219,7 +219,7 @@ def elementwise_func(func_name, A, B, value, prefetch=True):
         # paged function are fully asynchronous
         # if we return from this function, we want to the tensor
         # to be in the correct state, that is the final state after the
-        # operation occured. So we synchronize.
+        # operation occurred. So we synchronize.
         torch.cuda.synchronize()
 
 def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value)
@@ -589,7 +589,7 @@ def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: fl
 
 
 class QuantState:
-    """container for quantization state components to work with Params4bit and similar clases"""
+    """container for quantization state components to work with Params4bit and similar classes"""
     valid_quant_types = ('fp4', 'nf4')
     valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
     valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type',

diff --git a/csrc/kernels.cu b/csrc/kernels.cu
@@ -134,10 +134,10 @@ __device__ unsigned char dQuantizeFP4(float x)
 
   // we do a binary search
   // the pivots are divided by 12 (the FP4 absmax)
-  // since we assum input data is in [-1.0, 1.0]
+  // since we assume input data is in [-1.0, 1.0]
 
   // !be careful here, its easy to make a mistake
-  // that is difficult to noice if you add an extra
+  // that is difficult to notice if you add an extra
   // zero somewhere!
 
   int sign = x < 0 ? 0b1000 : 0b0000;
@@ -2259,8 +2259,8 @@ template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_
   }
 
   // 4. store data via atomicMax
-  // to store col data efficienctly we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
-  // into a striped arangement: [0, 8, 16, 24, ..] for t0
+  // to store col data efficiently we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
+  // into a striped arrangement: [0, 8, 16, 24, ..] for t0
   __syncthreads();
   BlockExchange(temp_storage.exchange).BlockedToStriped(local_col_absmax_values);
 
@@ -2310,7 +2310,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
 
   // data is in 32 column-tile major with tile width 32 columns and numRows rows
   // L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory.
-  // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
+  // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
   // C1. Compute val(row_stat*col_stat)/(127*127) (load 1/(127*127 into register))
   // C2. Compute normalization values and store col values in register
   // S1. Store C1 into 16-bit output
@@ -2383,7 +2383,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
     if(valid_items <= 0) // the sub-tile might have more elements than the tile itself
       break;
 
-    // L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
+    // L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
     LoadInt32(loadint32).Load(&(A[subtile_idx]), local_values, valid_items, 0);
     ExchangeInt32(exchangeint32).BlockedToWarpStriped(local_values, local_values);
 
@@ -2650,7 +2650,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
                   // row1 [col0 col1 ... col31]
                   // ...
                   //
-                  // As such we read consequtive entries with 256 threads (8rows x 32 columns)
+                  // As such we read consecutive entries with 256 threads (8rows x 32 columns)
                   // as j increase, the row increase by a factor of 8
                   // We load 8 rows per subrow loop, and subrow increase by 8 per loop
                   // so we have an offset of 8 rows every loop or (subrow/warps)*8 = (subrow/8)*8
@@ -2747,7 +2747,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
                     // each of these has 32 values in total for 32*4 = 128 as offset if odd
                     // every set of 4 columns increases the total offset by 16
                     // each even row increase the offset by 4, for example row 2 is offset by 4, 4 by 6 etc so: subrow/2*4 = subrow*2
-                    // this happends every 8 rows anew (subrow % 8)
+                    // this happens every 8 rows anew (subrow % 8)
                     // one writes 4 columns at once that is (col % 4) for the particular index in the subtile
                     int subcol = warp_lane;
 
@@ -3073,7 +3073,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
 //// 4. do dequantization from register of B into second pair of registers
 //// 5. store (4) into fragment
 //// 6. matmul aggregate into fragment C
-//// 7. aggreecate files of C into shared memroy block C
+//// 7. aggreecate files of C into shared memory block C
 //// 8. sum (7)
 //// 9. write outputs to matmul output matrix
 //}

diff --git a/deploy.sh b/deploy.sh
@@ -5,7 +5,7 @@ echo "MAKE SURE LD_LIBRARY_PATH IS EMPTY!"
 echo $LD_LIBRARY_PATH
 
 if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -24,7 +24,7 @@ make cpuonly CUDA_VERSION="CPU"
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -34,7 +34,7 @@ make cuda110 CUDA_VERSION=110
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -44,7 +44,7 @@ make cuda11x CUDA_VERSION=111
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -54,7 +54,7 @@ make cuda11x CUDA_VERSION=114
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -64,7 +64,7 @@ make cuda11x CUDA_VERSION=115
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -74,7 +74,7 @@ make cuda11x CUDA_VERSION=117
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -84,7 +84,7 @@ make cuda118 CUDA_VERSION=118
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -94,7 +94,7 @@ make cuda12x CUDA_VERSION=120
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -104,7 +104,7 @@ make cuda12x CUDA_VERSION=121
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -114,7 +114,7 @@ make cuda12x CUDA_VERSION=122
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -124,7 +124,7 @@ make cuda12x CUDA_VERSION=123
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -138,7 +138,7 @@ make cuda110_nomatmul CUDA_VERSION=110
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -149,7 +149,7 @@ make cuda11x_nomatmul CUDA_VERSION=111
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -159,7 +159,7 @@ make cuda11x_nomatmul CUDA_VERSION=114
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -169,7 +169,7 @@ make cuda11x_nomatmul CUDA_VERSION=115
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -179,7 +179,7 @@ make cuda11x_nomatmul CUDA_VERSION=117
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -189,7 +189,7 @@ make cuda118_nomatmul CUDA_VERSION=118
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -199,7 +199,7 @@ make cuda12x_nomatmul CUDA_VERSION=120
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -209,7 +209,7 @@ make cuda12x_nomatmul CUDA_VERSION=121
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -219,7 +219,7 @@ make cuda12x_nomatmul CUDA_VERSION=122
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 
@@ -229,7 +229,7 @@ make cuda12x_nomatmul CUDA_VERSION=123
 
 if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123_nocublaslt.so" ]; then
   # Control will enter here if $DIRECTORY doesn't exist.
-  echo "Compilation unsuccessul!" 1>&2
+  echo "Compilation unsuccessful!" 1>&2
   exit 64
 fi
 

diff --git a/docs/source/contributing.mdx b/docs/source/contributing.mdx
@@ -1,5 +1,5 @@
 # Contributors guidelines
-... stil under construction ... (feel free to propose materials, `bitsandbytes` is a community project)
+... still under construction ... (feel free to propose materials, `bitsandbytes` is a community project)
 
 ## Setup pre-commit hooks
 - Install pre-commit hooks with `pip install pre-commit`.

diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx
@@ -29,7 +29,7 @@ Please review the [bitsandbytes section in the Accelerate docs](https://huggingf
 
 # Trainer for the optimizers
 
-You can use any of the 8-bit and/or paged optimizers by simple passing them to the `transformers.Trainer` class on intialization.All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`).
+You can use any of the 8-bit and/or paged optimizers by simple passing them to the `transformers.Trainer` class on initialization.All bnb optimizers are supported by passing the correct string in `TrainingArguments`'s `optim` attribute - e.g. (`paged_adamw_32bit`).
 
 See the [official API docs for reference](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer).
 

diff --git a/docs/source/optimizers.mdx b/docs/source/optimizers.mdx
@@ -168,9 +168,9 @@ Possible options for the config override are: `betas, eps, weight_decay, lr, opt
 For overrides for particular layers, we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager:
 ```py
 class MyModule(torch.nn.Module):
-  def __init__(din, dout):
+  def __init__(d_in, d_out):
     super(MyModule, self).__init__()
-    self.linear = torch.nn.Linear(din, dout)
+    self.linear = torch.nn.Linear(d_in, d_out)
     # optimization will happen in 32-bit and
     # learning rate will be set to 0.0001 independent of the main learning rate
     config = {'optim_bits': 32, 'lr' : 0.0001}

diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h
@@ -157,7 +157,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
         FVec<AVX, float> vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float));
         IVec<AVX, float> ip = idxm;
 
-#else // do not use gather instrucions
+#else // do not use gather instructions
 
         union U {
             __m256i vec;

diff --git a/include/Portable.h b/include/Portable.h
@@ -147,5 +147,5 @@ inline T prev(T x)
     return x;
 }
 
-} // namepsace Details
+} // namespace Details
 } // namespace BinSearch
diff --git a/include/SIMD.h b/include/SIMD.h
@@ -568,5 +568,5 @@ FORCE_INLINE FVec<AVX, double> mulSub(const FVec<AVX, double>& a, const FVec<AVX
 
 #endif
 
-} // namepsace Details
+} // namespace Details
 } // namespace BinSearch