diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py index 98d228a85..c2ea87ab0 100644 --- a/bitsandbytes/optim/adagrad.py +++ b/bitsandbytes/optim/adagrad.py @@ -35,7 +35,7 @@ def __init__( initial_accumulator_value (`int`, defaults to 0): The initial momemtum values. eps (`float`, defaults to 1e-10): - The epsilon value for the optimizer. + The epsilon value prevents division by zero in the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. args (`dict`, defaults to `None`): @@ -104,7 +104,7 @@ def __init__( initial_accumulator_value (`int`, defaults to 0): The initial momemtum values. eps (`float`, defaults to 1e-10): - The epsilon value for the optimizer. + The epsilon value prevents division by zero in the optimizer. optim_bits (`int`, defaults to 8): The number of bits of the optimizer state. args (`dict`, defaults to `None`): @@ -174,7 +174,7 @@ def __init__( initial_accumulator_value (`int`, defaults to 0): The initial momemtum values. eps (`float`, defaults to 1e-10): - The epsilon value for the optimizer. + The epsilon value prevents division by zero in the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. args (`dict`, defaults to `None`): @@ -185,7 +185,7 @@ def __init__( Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. block_wise (`bool`, defaults to `True`): Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. 
- """ + """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: diff --git a/bitsandbytes/optim/adam.py b/bitsandbytes/optim/adam.py index 86981eb86..e534c8b8f 100644 --- a/bitsandbytes/optim/adam.py +++ b/bitsandbytes/optim/adam.py @@ -16,31 +16,205 @@ class Adam(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Base Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Adam8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Adam32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 32-bit Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class PagedAdam(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Paged Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdam8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit paged Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdam32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Paged 32-bit Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class AnalysisAdam(torch.optim.Optimizer): diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 17383eed5..1e2dc04de 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -8,30 +8,204 @@ class AdamW(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Base AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. 
+ is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged ) class AdamW8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged ) class AdamW32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 32-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class PagedAdamW(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdamW8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 8-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdamW32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 32-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) diff --git a/bitsandbytes/optim/lamb.py b/bitsandbytes/optim/lamb.py index 1fbb6fadc..ec829ee85 100644 --- a/bitsandbytes/optim/lamb.py +++ b/bitsandbytes/optim/lamb.py @@ -23,6 +23,39 @@ def __init__( block_wise=False, max_unorm=1.0, ): + """ + Base LAMB optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + bias_correction (`bool`, defaults to `True`): + Whether to apply bias correction to the first and second-order moments. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + adam_w_mode (`bool`, defaults to `True`): + Whether to use the AdamW variant. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `False`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 1.0): + The maximum gradient norm. 
+ """ super().__init__( "lamb", params, @@ -56,6 +89,37 @@ def __init__( block_wise=False, max_unorm=1.0, ): + """ + 8-bit LAMB optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + bias_correction (`bool`, defaults to `True`): + Whether to apply bias correction to the first and second-order moments. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + adam_w_mode (`bool`, defaults to `True`): + Whether to use the AdamW variant. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `False`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 1.0): + The maximum gradient norm. + """ super().__init__( "lamb", params, @@ -89,6 +153,37 @@ def __init__( block_wise=False, max_unorm=1.0, ): + """ + 32-bit LAMB optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. 
+ bias_correction (`bool`, defaults to `True`): + Whether to apply bias correction to the first and second-order moments. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + adam_w_mode (`bool`, defaults to `True`): + Whether to use the AdamW variant. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `False`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 1.0): + The maximum gradient norm. + """ super().__init__( "lamb", params, diff --git a/bitsandbytes/optim/lars.py b/bitsandbytes/optim/lars.py index 73554e3cc..7449b805b 100644 --- a/bitsandbytes/optim/lars.py +++ b/bitsandbytes/optim/lars.py @@ -23,6 +23,33 @@ def __init__( percentile_clipping=100, max_unorm=0.02, ): + """ + Base LARS optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. 
+ weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + max_unorm (`float`, defaults to 0.02): + The maximum gradient norm. + """ if momentum == 0: raise NotImplementedError( "LARS without momentum is not supported!" @@ -57,6 +84,31 @@ def __init__( percentile_clipping=100, max_unorm=0.02, ): + """ + 8-bit LARS optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + max_unorm (`float`, defaults to 0.02): + The maximum gradient norm. + """ if momentum == 0: raise NotImplementedError( "LARS without momentum is not supported!" 
@@ -91,6 +143,31 @@ def __init__( percentile_clipping=100, max_unorm=0.02, ): + """ + 32-bit LARS optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + max_unorm (`float`, defaults to 0.02): + The maximum gradient norm. + """ if momentum == 0: raise NotImplementedError( "LARS without momentum is not supported!" diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py index b6ba4a9f1..ce185f863 100644 --- a/bitsandbytes/optim/lion.py +++ b/bitsandbytes/optim/lion.py @@ -7,25 +7,165 @@ class Lion(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Base Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.99)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. 
+ optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Lion8bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.99)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. 
+ block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Lion32bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 32-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.99)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ super().__init__("lion", params, lr, betas, 0., weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class PagedLion(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedLion8bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 8-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. 
+ betas (`tuple(float, float)`, defaults to (0.9, 0.99)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedLion32bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 32-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.99)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. 
+ percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index c21b3d7d3..a97afb026 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -59,8 +59,8 @@ def override_config( Override initial optimizer config with specific hyperparameters. The key-values of the optimizer config for the input parameters are overridden - This can be both, optimizer parameters like "betas" or "lr", or it can be - 8-bit specific parameters like "optim_bits" or "percentile_clipping". + This can be both, optimizer parameters like `betas` or `lr`, or it can be + 8-bit specific parameters like `optim_bits` or `percentile_clipping`. Arguments: parameters (`torch.Tensor` or `list(torch.Tensors)`): @@ -160,7 +160,7 @@ def load_state_dict(self, state_dict): """Load an optimizer state. Arguments: - state_dict (`dict`): + state_dict (`dict`): An optimizer state (should be returned from a call to `state_dict`) to load. """ # deepcopy, to be consistent with module API @@ -272,7 +272,7 @@ def step(self, closure=None): """Perform a single optimization step. Arguments: - closure (`Callable`, *optional*, defaults to `None`): + closure (`Callable`, *optional*, defaults to `None`): A closure that reevaluates the model and returns the loss. """ loss = None @@ -648,7 +648,7 @@ def __init__( skip_zeros (`bool`, defaults to `False`): Whether to skip zero values for sparse gradients and models to ensure correct updates. 
is_paged (`bool`, defaults to `False`): - Whether the optimizer is a paged optimizer or not. + Whether the optimizer is a paged optimizer or not. """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") diff --git a/bitsandbytes/optim/rmsprop.py b/bitsandbytes/optim/rmsprop.py index 2853ca723..ac371a66f 100644 --- a/bitsandbytes/optim/rmsprop.py +++ b/bitsandbytes/optim/rmsprop.py @@ -21,6 +21,35 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + Base RMSprop optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + alpha (`float`, defaults to 0.99): + The alpha value is the decay rate of the squared gradients of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + centered (`bool`, defaults to `False`): + Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if alpha == 0: raise NotImplementedError( "RMSprop with alpha==0.0 is not supported!" 
@@ -57,6 +86,35 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 8-bit RMSprop optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + alpha (`float`, defaults to 0.99): + The alpha value is the decay rate of the squared gradients of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + centered (`bool`, defaults to `False`): + Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if alpha == 0: raise NotImplementedError( "RMSprop with alpha==0.0 is not supported!" @@ -93,6 +151,35 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 32-bit RMSprop optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + alpha (`float`, defaults to 0.99): + The alpha value is the decay rate of the squared gradients of the optimizer. 
+ eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + centered (`bool`, defaults to `False`): + Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if alpha == 0: raise NotImplementedError( diff --git a/bitsandbytes/optim/sgd.py b/bitsandbytes/optim/sgd.py index 3c0fc2b9f..0f0b12e4b 100644 --- a/bitsandbytes/optim/sgd.py +++ b/bitsandbytes/optim/sgd.py @@ -20,6 +20,33 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + Base SGD optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. 
+ args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") super().__init__( @@ -51,6 +78,31 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 8-bit SGD optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. 
+ """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") super().__init__( @@ -82,6 +134,31 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 32-bit SGD optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") super().__init__( diff --git a/docs/source/reference/optim/adagrad.mdx b/docs/source/reference/optim/adagrad.mdx index 7bc0f3040..8dddba04c 100644 --- a/docs/source/reference/optim/adagrad.mdx +++ b/docs/source/reference/optim/adagrad.mdx @@ -1,17 +1,18 @@ # AdaGrad -[AdaGrad (Adaptive Gradient)](https://jmlr.org/papers/v12/duchi11a.html) is an optimizer that adaptively adjusts the learning rate for each parameter based on their historical gradients. +[AdaGrad (Adaptive Gradient)](https://jmlr.org/papers/v12/duchi11a.html) is an adaptive learning rate optimizer. 
AdaGrad stores a sum of the squared past gradients for each parameter and uses it to scale their learning rate. This allows the learning rate to be automatically lower or higher depending on the magnitude of the gradient, eliminating the need to manually tune the learning rate. -* Parameters with larger gradients are updated with smaller learning rates to avoid overshooting the minimum. -* Parameters with smaller gradients are updated with larger learning rates to catch up and converge faster. - -Since learning rates are automatically adjusted, AdaGrad does not require manually tuning learning rates. +## Adagrad[[api-class]] [[autodoc]] bitsandbytes.optim.Adagrad - __init__ +## Adagrad8bit + [[autodoc]] bitsandbytes.optim.Adagrad8bit - __init__ +## Adagrad32bit + [[autodoc]] bitsandbytes.optim.Adagrad32bit - __init__ diff --git a/docs/source/reference/optim/adam.mdx b/docs/source/reference/optim/adam.mdx index 0bb12ca80..f367bc415 100644 --- a/docs/source/reference/optim/adam.mdx +++ b/docs/source/reference/optim/adam.mdx @@ -1,15 +1,38 @@ # Adam +[Adam (Adaptive moment estimation)](https://hf.co/papers/1412.6980) is an adaptive learning rate optimizer, combining ideas from [`SGD`] with momentum and [`RMSprop`] to automatically scale the learning rate: + +- a weighted average of the past gradients to provide direction (first-moment) + +- a weighted average of the *squared* past gradients to adapt the learning rate to each parameter (second-moment) + +bitsandbytes also supports paged optimizers which take advantage of CUDA's unified memory to transfer memory from the GPU to the CPU when GPU memory is exhausted. 
+ +## Adam[[api-class]] + [[autodoc]] bitsandbytes.optim.Adam + - __init__ + +## Adam8bit [[autodoc]] bitsandbytes.optim.Adam8bit + - __init__ + +## Adam32bit [[autodoc]] bitsandbytes.optim.Adam32bit + - __init__ -## Paged Adam +## PagedAdam [[autodoc]] bitsandbytes.optim.PagedAdam + - __init__ + +## PagedAdam8bit [[autodoc]] bitsandbytes.optim.PagedAdam8bit + - __init__ + +## PagedAdam32bit [[autodoc]] bitsandbytes.optim.PagedAdam32bit + - __init__ diff --git a/docs/source/reference/optim/adamw.mdx b/docs/source/reference/optim/adamw.mdx index 9e85716df..e3dd410de 100644 --- a/docs/source/reference/optim/adamw.mdx +++ b/docs/source/reference/optim/adamw.mdx @@ -1,15 +1,34 @@ # AdamW +[AdamW](https://hf.co/papers/1711.05101) is a variant of the [`Adam`] optimizer that separates weight decay from the gradient update based on the observation that the weight decay formulation is different when applied to [`SGD`] and [`Adam`]. + +bitsandbytes also supports paged optimizers which take advantage of CUDA's unified memory to transfer memory from the GPU to the CPU when GPU memory is exhausted. 
+ +## AdamW[[api-class]] + [[autodoc]] bitsandbytes.optim.AdamW + - __init__ + +## AdamW8bit [[autodoc]] bitsandbytes.optim.AdamW8bit + - __init__ + +## AdamW32bit [[autodoc]] bitsandbytes.optim.AdamW32bit + - __init__ -## Paged AdamW +## PagedAdamW [[autodoc]] bitsandbytes.optim.PagedAdamW + - __init__ +## PagedAdamW8bit [[autodoc]] bitsandbytes.optim.PagedAdamW8bit + - __init__ + +## PagedAdamW32bit [[autodoc]] bitsandbytes.optim.PagedAdamW32bit + - __init__ diff --git a/docs/source/reference/optim/lamb.mdx b/docs/source/reference/optim/lamb.mdx index ab583ed91..d581380ed 100644 --- a/docs/source/reference/optim/lamb.mdx +++ b/docs/source/reference/optim/lamb.mdx @@ -1,7 +1,21 @@ # LAMB +[LAMB (Layerwise adaptive large batch optimization)](https://hf.co/papers/1904.00962) is an adaptive optimizer designed for training with large batch sizes to accelerate training, combining ideas from [`LARS`] and [`Adam`] to automatically scale the learning rate for each layer: + +- calculates a *trust ratio* between the weight and gradient norm in a layer and clips the ratio to prevent overly large or small updates + +- updates weights with the first and second-moments + +## LAMB[[api-class]] + [[autodoc]] bitsandbytes.optim.LAMB + - __init__ + +## LAMB8bit [[autodoc]] bitsandbytes.optim.LAMB8bit + - __init__ + +## LAMB32bit [[autodoc]] bitsandbytes.optim.LAMB32bit + - __init__ diff --git a/docs/source/reference/optim/lars.mdx b/docs/source/reference/optim/lars.mdx index b5dde29d0..93b5c55c3 100644 --- a/docs/source/reference/optim/lars.mdx +++ b/docs/source/reference/optim/lars.mdx @@ -1,7 +1,18 @@ # LARS +[LARS (Layer-wise Adaptive Rate Scaling)](https://hf.co/papers/1708.03888) is an optimizer designed for training with large batch sizes to accelerate training. LARS uses a separate learning rate for each *layer* instead of each parameter. The learning rate is calculated from a *trust ratio* between the weight and gradient norm in a layer. 
This helps calibrate a stable update size. + +## LARS[[api-class]] + [[autodoc]] bitsandbytes.optim.LARS + - __init__ + +## LARS8bit [[autodoc]] bitsandbytes.optim.LARS8bit + - __init__ + +## LARS32bit [[autodoc]] bitsandbytes.optim.LARS32bit + - __init__ diff --git a/docs/source/reference/optim/lion.mdx b/docs/source/reference/optim/lion.mdx index a9f849a3e..8183c27e7 100644 --- a/docs/source/reference/optim/lion.mdx +++ b/docs/source/reference/optim/lion.mdx @@ -1,15 +1,33 @@ # Lion +[Lion (Evolved Sign Momentum)](https://hf.co/papers/2302.06675) is a unique optimizer that uses the sign of the gradient to determine the update direction of the momentum. This makes Lion more memory-efficient and faster than [`AdamW`] which tracks and stores the first and second-order moments. + +## Lion[[api-class]] + [[autodoc]] bitsandbytes.optim.Lion + - __init__ + +## Lion8bit [[autodoc]] bitsandbytes.optim.Lion8bit + - __init__ + +## Lion32bit [[autodoc]] bitsandbytes.optim.Lion32bit + - __init__ -## Paged Lion +## PagedLion [[autodoc]] bitsandbytes.optim.PagedLion + - __init__ + +## PagedLion8bit [[autodoc]] bitsandbytes.optim.PagedLion8bit + - __init__ + +## PagedLion32bit [[autodoc]] bitsandbytes.optim.PagedLion32bit + - __init__ diff --git a/docs/source/reference/optim/optim_overview.mdx b/docs/source/reference/optim/optim_overview.mdx index a4b5482a8..48e12b544 100644 --- a/docs/source/reference/optim/optim_overview.mdx +++ b/docs/source/reference/optim/optim_overview.mdx @@ -1,11 +1,21 @@ # Overview +[8-bit optimizers](https://hf.co/papers/2110.02861) reduce the memory footprint of 32-bit optimizers without any performance degradation which means you can train large models with many parameters faster. At the core of 8-bit optimizers is block-wise quantization which enables quantization accuracy, computational efficiency, and stability. 
+ +bitsandbytes provides 8-bit optimizers through the base [`Optimizer8bit`] class, and additionally provides [`Optimizer2State`] and [`Optimizer1State`] for 2-state (for example, [`Adam`]) and 1-state (for example, [`Adagrad`]) optimizers respectively. To provide custom optimizer hyperparameters, use the [`GlobalOptimManager`] class to configure the optimizer. + +## Optimizer8bit + [[autodoc]] bitsandbytes.optim.optimizer.Optimizer8bit - __init__ +## Optimizer2State + [[autodoc]] bitsandbytes.optim.optimizer.Optimizer2State - __init__ +## Optimizer1State + [[autodoc]] bitsandbytes.optim.optimizer.Optimizer1State - __init__ diff --git a/docs/source/reference/optim/rmsprop.mdx b/docs/source/reference/optim/rmsprop.mdx index 2ecb7f579..33d839f6b 100644 --- a/docs/source/reference/optim/rmsprop.mdx +++ b/docs/source/reference/optim/rmsprop.mdx @@ -1,7 +1,15 @@ # RMSprop +RMSprop is an adaptive learning rate optimizer that is very similar to [`Adagrad`]. RMSprop stores a *weighted average* of the squared past gradients for each parameter and uses it to scale their learning rate. This allows the learning rate to be automatically lower or higher depending on the magnitude of the gradient, and it prevents the learning rate from diminishing. + +## RMSprop[[api-class]] + [[autodoc]] bitsandbytes.optim.RMSprop +## RMSprop8bit + [[autodoc]] bitsandbytes.optim.RMSprop8bit +## RMSprop32bit + [[autodoc]] bitsandbytes.optim.RMSprop32bit diff --git a/docs/source/reference/optim/sgd.mdx b/docs/source/reference/optim/sgd.mdx index 3c24edcd2..a0d09d1e8 100644 --- a/docs/source/reference/optim/sgd.mdx +++ b/docs/source/reference/optim/sgd.mdx @@ -1,7 +1,20 @@ # SGD +Stochastic gradient descent (SGD) is a basic gradient descent optimizer to minimize loss given a set of model parameters and updates the parameters in the opposite direction of the gradient. The update is performed on a randomly sampled mini-batch of data from the dataset. 
+ +bitsandbytes also supports momentum and Nesterov momentum to accelerate SGD by adding a weighted average of past gradients to the current gradient. + +## SGD[[api-class]] + [[autodoc]] bitsandbytes.optim.SGD + - __init__ + +## SGD8bit [[autodoc]] bitsandbytes.optim.SGD8bit + - __init__ + +## SGD32bit [[autodoc]] bitsandbytes.optim.SGD32bit + - __init__