Skip to content

Commit

Permalink
Enable crate-ci/typos lint; fix typos
Browse files Browse the repository at this point in the history
  • Loading branch information
akx committed Feb 1, 2024
1 parent 6974920 commit 7d81b19
Show file tree
Hide file tree
Showing 12 changed files with 66 additions and 44 deletions.
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,7 @@ repos:
- id: mixed-line-ending
args:
- --fix=lf
- repo: https://github.com/crate-ci/typos
rev: v1.17.2
hooks:
- id: typos
11 changes: 11 additions & 0 deletions _typos.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[files]

[default.extend-identifiers]

[type.py.extend-words]
"BA" = "BA" # used as a commented-out variable in tests

[type.cuda.extend-words]
"subtile" = "subtile"
"subtiles" = "subtiles"
"transation" = "transation" # TODO: is this transition, transaction, translation..?
4 changes: 2 additions & 2 deletions benchmarking/switchback/make_plot_with_jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@

('x_quantize_rowwise', 'P', '--', 'C4', 'Quantize rowwise X (switchback)'),
('g_quantize_rowwise', 'P', '-.', 'C4', 'Quantize rowwise G (switchback)'),
('w_quantize_global', '.', '--', 'C4', 'Quatnize global W (switchback)'),
('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize gloabl and\ntranspose W (switchback)'),
('w_quantize_global', '.', '--', 'C4', 'Quantize global W (switchback)'),
('w_quantize_global_transpose', '.', '-.', 'C4', 'Quantize global and\ntranspose W (switchback)'),
]:
xs = []
ys = []
Expand Down
4 changes: 2 additions & 2 deletions bitsandbytes/cuda_setup/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
[ ] TODO: Q - What if we have multiple GPUs of different makes?
- CUDA version
- Software:
- CPU-only: only CPU quantization functions (no optimizer, no matrix multipl)
- CPU-only: only CPU quantization functions (no optimizer, no matrix multiply)
- CuBLAS-LT: full-build 8-bit optimizer
- no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`)
Expand Down Expand Up @@ -258,7 +258,7 @@ def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None:
warning_msg = (
f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. "
"We select the PyTorch default libcudart.so, which is {torch.version.cuda},"
"but this might missmatch with the CUDA version that is needed for bitsandbytes."
"but this might mismatch with the CUDA version that is needed for bitsandbytes."
"To override this behavior set the BNB_CUDA_VERSION=<version string, e.g. 122> environmental variable"
"For example, if you want to use the CUDA version 122"
"BNB_CUDA_VERSION=122 python ..."
Expand Down
6 changes: 3 additions & 3 deletions bitsandbytes/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def get_instance(cls):
return cls._instance

def prefetch_all(self, to_cpu=False):
# assume the first added, will be hte
# assume the first added, will be the
# ones that are used first, so swap them in last
# in the case they are evicted again
for t in self.paged_tensors[::-1]:
Expand Down Expand Up @@ -219,7 +219,7 @@ def elementwise_func(func_name, A, B, value, prefetch=True):
# paged functions are fully asynchronous
# if we return from this function, we want the tensor
# to be in the correct state, that is the final state after the
# operation occured. So we synchronize.
# operation occurred. So we synchronize.
torch.cuda.synchronize()

def fill(A, value, device=None, prefetch=True): elementwise_func('fill', A, None, value)
Expand Down Expand Up @@ -589,7 +589,7 @@ def estimate_quantiles(A: Tensor, out: Optional[torch.Tensor] = None, offset: fl


class QuantState:
"""container for quantization state components to work with Params4bit and similar clases"""
"""container for quantization state components to work with Params4bit and similar classes"""
valid_quant_types = ('fp4', 'nf4')
valid_qs_type_keys = [f"bitsandbytes__{x}" for x in valid_quant_types]
valid_qs_keys = ['absmax', 'quant_map', 'nested_absmax', 'nested_quant_map', 'quant_state', 'quant_type',
Expand Down
18 changes: 9 additions & 9 deletions csrc/kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ __device__ unsigned char dQuantizeFP4(float x)

// we do a binary search
// the pivots are divided by 12 (the FP4 absmax)
// since we assum input data is in [-1.0, 1.0]
// since we assume input data is in [-1.0, 1.0]

// !be careful here, it's easy to make a mistake
// that is difficult to noice if you add an extra
// that is difficult to notice if you add an extra
// zero somewhere!

int sign = x < 0 ? 0b1000 : 0b0000;
Expand Down Expand Up @@ -2259,8 +2259,8 @@ template<typename T, int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_
}

// 4. store data via atomicMax
// to store col data efficienctly we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
// into a striped arangement: [0, 8, 16, 24, ..] for t0
// to store col data efficiently we need to rewrite the smem blocked data [0, 1, 2, 3...] for t0
// into a striped arrangement: [0, 8, 16, 24, ..] for t0
__syncthreads();
BlockExchange(temp_storage.exchange).BlockedToStriped(local_col_absmax_values);

Expand Down Expand Up @@ -2310,7 +2310,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd

// data is in 32 column-tile major with tile width 32 columns and numRows rows
// L1. Load sub-tile row/col statistics. Each thread only holds 1 col, load rows into shared memory.
// L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
// L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
// C1. Compute val(row_stat*col_stat)/(127*127) (load 1/(127*127 into register))
// C2. Compute normalization values and store col values in register
// S1. Store C1 into 16-bit output
Expand Down Expand Up @@ -2383,7 +2383,7 @@ template <int ITEMS_PER_THREAD, int SUBTILE_ROWS, int THREADS>__global__ void kd
if(valid_items <= 0) // the sub-tile might have more elements than the tile itself
break;

// L2. Load data in warp-striped arangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
// L2. Load data in warp-striped arrangement (t0 holds colidx [0, 0, 0, 0], rowidx [0, 1, 2, 3])
LoadInt32(loadint32).Load(&(A[subtile_idx]), local_values, valid_items, 0);
ExchangeInt32(exchangeint32).BlockedToWarpStriped(local_values, local_values);

Expand Down Expand Up @@ -2650,7 +2650,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
// row1 [col0 col1 ... col31]
// ...
//
// As such we read consequtive entries with 256 threads (8rows x 32 columns)
// As such we read consecutive entries with 256 threads (8rows x 32 columns)
// as j increases, the row increases by a factor of 8
// We load 8 rows per subrow loop, and subrow increases by 8 per loop
// so we have an offset of 8 rows every loop or (subrow/warps)*8 = (subrow/8)*8
Expand Down Expand Up @@ -2747,7 +2747,7 @@ template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int T
// each of these has 32 values in total for 32*4 = 128 as offset if odd
// every set of 4 columns increases the total offset by 16
// each even row increase the offset by 4, for example row 2 is offset by 4, 4 by 6 etc so: subrow/2*4 = subrow*2
// this happends every 8 rows anew (subrow % 8)
// this happens every 8 rows anew (subrow % 8)
// one writes 4 columns at once that is (col % 4) for the particular index in the subtile
int subcol = warp_lane;

Expand Down Expand Up @@ -3073,7 +3073,7 @@ template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *
//// 4. do dequantization from register of B into second pair of registers
//// 5. store (4) into fragment
//// 6. matmul aggregate into fragment C
//// 7. aggreecate files of C into shared memroy block C
//// 7. aggregate files of C into shared memory block C
//// 8. sum (7)
//// 9. write outputs to matmul output matrix
//}
Expand Down
44 changes: 22 additions & 22 deletions deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ echo "MAKE SURE LD_LIBRARY_PATH IS EMPTY!"
echo $LD_LIBRARY_PATH

if [[ ! -z "${LD_LIBRARY_PATH}" ]]; then
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -24,7 +24,7 @@ make cpuonly CUDA_VERSION="CPU"

if [ ! -f "./bitsandbytes/libbitsandbytes_cpu.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -34,7 +34,7 @@ make cuda110 CUDA_VERSION=110

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -44,7 +44,7 @@ make cuda11x CUDA_VERSION=111

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -54,7 +54,7 @@ make cuda11x CUDA_VERSION=114

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -64,7 +64,7 @@ make cuda11x CUDA_VERSION=115

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -74,7 +74,7 @@ make cuda11x CUDA_VERSION=117

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -84,7 +84,7 @@ make cuda118 CUDA_VERSION=118

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -94,7 +94,7 @@ make cuda12x CUDA_VERSION=120

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -104,7 +104,7 @@ make cuda12x CUDA_VERSION=121

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -114,7 +114,7 @@ make cuda12x CUDA_VERSION=122

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -124,7 +124,7 @@ make cuda12x CUDA_VERSION=123

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -138,7 +138,7 @@ make cuda110_nomatmul CUDA_VERSION=110

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda110_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -149,7 +149,7 @@ make cuda11x_nomatmul CUDA_VERSION=111

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda111_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -159,7 +159,7 @@ make cuda11x_nomatmul CUDA_VERSION=114

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda114_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -169,7 +169,7 @@ make cuda11x_nomatmul CUDA_VERSION=115

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda115_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -179,7 +179,7 @@ make cuda11x_nomatmul CUDA_VERSION=117

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda117_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -189,7 +189,7 @@ make cuda118_nomatmul CUDA_VERSION=118

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda118_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -199,7 +199,7 @@ make cuda12x_nomatmul CUDA_VERSION=120

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda120_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -209,7 +209,7 @@ make cuda12x_nomatmul CUDA_VERSION=121

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda121_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -219,7 +219,7 @@ make cuda12x_nomatmul CUDA_VERSION=122

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda122_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand All @@ -229,7 +229,7 @@ make cuda12x_nomatmul CUDA_VERSION=123

if [ ! -f "./bitsandbytes/libbitsandbytes_cuda123_nocublaslt.so" ]; then
# Control will enter here if $DIRECTORY doesn't exist.
echo "Compilation unsuccessul!" 1>&2
echo "Compilation unsuccessful!" 1>&2
exit 64
fi

Expand Down
4 changes: 2 additions & 2 deletions howto_config_override.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ Possible options for the config override are: `betas, eps, weight_decay, lr, opt
For overrides for particular layers we recommend overriding locally in each module. You can do this by passing the module, the parameter, and its attribute name to the GlobalOptimManager:
```python
class MyModule(torch.nn.Module):
def __init__(din, dout):
def __init__(din, dout):
super(MyModule, self).__init__()
self.linear = torch.nn.Linear(din, dout)
self.linear = torch.nn.Linear(din, dout)
# optimization will happen in 32-bit and
# learning rate will be set to 0.0001 independent of the main learning rate
config = {'optim_bits': 32, 'lr' : 0.0001}
Expand Down
2 changes: 1 addition & 1 deletion include/Algo-Direct2.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
FVec<AVX, float> vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float));
IVec<AVX, float> ip = idxm;

#else // do not use gather instrucions
#else // do not use gather instructions

union U {
__m256i vec;
Expand Down
2 changes: 1 addition & 1 deletion include/Portable.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,5 +147,5 @@ inline T prev(T x)
return x;
}

} // namepsace Details
} // namespace Details
} // namespace BinSearch
2 changes: 1 addition & 1 deletion include/SIMD.h
Original file line number Diff line number Diff line change
Expand Up @@ -568,5 +568,5 @@ FORCE_INLINE FVec<AVX, double> mulSub(const FVec<AVX, double>& a, const FVec<AVX

#endif

} // namepsace Details
} // namespace Details
} // namespace BinSearch
9 changes: 8 additions & 1 deletion tests/test_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,14 @@ def test_linear8bitlt_no_fp16_weights(threshold, memory_efficient_backward):
assert (idx == 0).sum().item() <= b1.numel() * 0.005


@pytest.mark.parametrize("module", [lambda nin, nout, bias=True: bnb.nn.Linear8bitLt(nin, nout, bias=bias, has_fp16_weights=False), bnb.nn.LinearFP4], ids=['Int8Lt', 'FP4'])
@pytest.mark.parametrize(
"module",
[
lambda n_in, n_out, bias=True: bnb.nn.Linear8bitLt(n_in, n_out, bias=bias, has_fp16_weights=False),
bnb.nn.LinearFP4,
],
ids=['Int8Lt', 'FP4'],
)
def test_linear_kbit_fp32_bias(module):
# casts model to fp16 -> int8 automatically
l1 = module(32, 64).cuda()
Expand Down

0 comments on commit 7d81b19

Please sign in to comment.