From d1b146bbc4c6659f07b7917b693a46a9f5c28570 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 15 Mar 2024 22:57:48 -0400
Subject: [PATCH 1/8] support galore once upstreamed into transformers

---
 README.md                                    | 19 +++++++++++++++++++
 src/axolotl/core/trainer_builder.py          |  7 +++++++
 .../config/models/input/v0_4_1/__init__.py   |  9 +++++++++
 3 files changed, 35 insertions(+)

diff --git a/README.md b/README.md
index 826c16045c..9b6e9158d6 100644
--- a/README.md
+++ b/README.md
@@ -883,7 +883,26 @@ lr_div_factor: # Learning rate div factor
 # - paged_adamw_8bit
 # - paged_lion_32bit
 # - paged_lion_8bit
+# - galore_adamw
+# - galore_adamw_8bit
+# - galore_adafactor
+# - galore_adamw_layerwise
+# - galore_adamw_8bit_layerwise
+# - galore_adafactor_layerwise
 optimizer:
+# Dictionary of arguments to pass to the optimizer
+optim_args:
+# For GaLore optimizers, the following optim_args are available:
+# rank:            # type: int
+# update_proj_gap: # type: int
+# scale:           # type: float
+# proj_type:       # type: str, default = std
+
+# The target modules to optimize, i.e. the module names that you would like to train. Currently used only by the GaLore algorithm.
+optim_target_modules:
+# - attn
+# - mlp
+
 # Specify weight decay
 weight_decay:
 # adamw hyperparams
diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index d11f0c6532..d74665b18a 100644
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -232,6 +232,7 @@ def create_optimizer(self):
         if self.optimizer is None:  # pylint: disable=access-member-before-definition
             optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(
                 self.args,
+                opt_model,
             )

             loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None)
@@ -1016,6 +1017,12 @@ def build(self, total_num_steps):
         training_arguments_kwargs["optim"] = (
             self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
         )
+        if self.cfg.optim_args:
+            training_arguments_kwargs["optim_args"] = self.cfg.optim_args
+        if self.cfg.optim_target_modules:
+            training_arguments_kwargs[
+                "optim_target_modules"
+            ] = self.cfg.optim_target_modules
         training_arguments_kwargs["loraplus_lr_ratio"] = self.cfg.loraplus_lr_ratio
         training_arguments_kwargs[
             "loraplus_lr_embedding"
diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index dfe9a9be96..02c5e44d1a 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -310,6 +310,15 @@ class HyperparametersConfig(BaseModel):
     learning_rate: Union[str, float]
     weight_decay: Optional[float] = None
     optimizer: Optional[Union[OptimizerNames, Literal["lion_pytorch"]]] = None
+    optim_args: Optional[str] = Field(
+        default=None, metadata={"help": "Optional arguments to supply to optimizer."}
+    )
+    optim_target_modules: Optional[List[str]] = Field(
+        default=None,
+        metadata={
+            "help": "The target modules to optimize, i.e. the module names that you would like to train."
+        },
+    )
     torchdistx_path: Optional[str] = None
     lr_scheduler: Optional[SchedulerType] = None
     lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
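These new config keys pass straight through to the corresponding `transformers.TrainingArguments` fields. A minimal sketch of the resulting arguments for a GaLore run, assuming a transformers build that includes GaLore support; the hyperparameter values are illustrative assumptions, not tuned recommendations:

```python
from transformers import TrainingArguments

# Illustrative values only: rank, update_proj_gap, scale, and proj_type are
# GaLore projection hyperparameters, passed as a comma-separated key=value string.
args = TrainingArguments(
    output_dir="./out",
    optim="galore_adamw",
    optim_args="rank=128,update_proj_gap=200,scale=0.25,proj_type=std",
    # only parameters inside modules whose names match get the GaLore optimizer
    optim_target_modules=["self_attn", "mlp"],
)
```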
From f4686778f9af941fdb7fb41d8c49258a332229a1 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 15 Mar 2024 23:30:04 -0400
Subject: [PATCH 2/8] update module name for llama in readme and fix typing for all_linear

---
 README.md                                                | 2 +-
 src/axolotl/utils/config/models/input/v0_4_1/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 9b6e9158d6..e2c7fb6b2f 100644
--- a/README.md
+++ b/README.md
@@ -900,7 +900,7 @@ optim_args:

 # The target modules to optimize, i.e. the module names that you would like to train. Currently used only by the GaLore algorithm.
 optim_target_modules:
-# - attn
+# - self_attn # for llama
 # - mlp

 # Specify weight decay
diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index 02c5e44d1a..971bd78207 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -313,7 +313,7 @@ class HyperparametersConfig(BaseModel):
     optim_args: Optional[str] = Field(
         default=None, metadata={"help": "Optional arguments to supply to optimizer."}
     )
-    optim_target_modules: Optional[List[str]] = Field(
+    optim_target_modules: Optional[Union[List[str], Literal["all_linear"]]] = Field(
         default=None,
         metadata={
             "help": "The target modules to optimize, i.e. the module names that you would like to train."

From 558880e19d5118ea80f2a43b5d60c3326c6502af Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 15 Mar 2024 23:31:53 -0400
Subject: [PATCH 3/8] bump trl for deprecation fixes from newer transformers

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 191948a400..6f6ec310fc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -39,5 +39,5 @@ s3fs
 gcsfs
 # adlfs

-trl>=0.7.9
+trl @ git+https://github.com/huggingface/trl.git@304e208f778a5442c30cdda500348226cdc97d90
 fastcore>=1.5.29

From 59f593826f4378287ff4c66100724592b01cf8ac Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 15 Mar 2024 23:40:40 -0400
Subject: [PATCH 4/8] include galore as an extra and install in docker image

---
 docker/Dockerfile | 4 ++--
 setup.py          | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 62904af722..84046c6522 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -21,9 +21,9 @@ WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
     fi

 # So we can test the Docker image
diff --git a/setup.py b/setup.py
index 40dd0a6686..cbed45afe8 100644
--- a/setup.py
+++ b/setup.py
@@ -89,5 +89,6 @@ def parse_requirements():
         "lion-pytorch": [
             "lion-pytorch==0.1.2",
         ],
+        "galore": ["galore_torch @ git+https://github.com/jiaweizzhao/GaLore"],
     },
 )

From 974933d620f4a3b990786a74a8eb59d6f73b571d Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 16 Mar 2024 00:06:44 -0400
Subject: [PATCH 5/8] fix optim_args type

---
 src/axolotl/utils/config/models/input/v0_4_1/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index 971bd78207..4619ace548 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -310,7 +310,7 @@ class HyperparametersConfig(BaseModel):
     learning_rate: Union[str, float]
     weight_decay: Optional[float] = None
     optimizer: Optional[Union[OptimizerNames, Literal["lion_pytorch"]]] = None
-    optim_args: Optional[str] = Field(
+    optim_args: Optional[Dict[str, Any]] = Field(
         default=None, metadata={"help": "Optional arguments to supply to optimizer."}
     )
     optim_target_modules: Optional[Union[List[str], Literal["all_linear"]]] = Field(
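Taken together, patches 2 and 5 widen the config typing: `optim_target_modules` now also accepts the literal `all_linear`, and `optim_args` can be given as a mapping. A sketch of the two config shapes this is meant to support; the values are illustrative, and note that patch 6 below re-allows the plain-string form of `optim_args` as well:

```python
# Both shapes should validate against the updated HyperparametersConfig fields.
cfg_mapping = {
    "optimizer": "galore_adamw",
    "optim_args": {"rank": 128, "update_proj_gap": 200, "scale": 0.25},
    "optim_target_modules": ["self_attn", "mlp"],  # explicit module names
}
cfg_shorthand = {
    "optimizer": "galore_adamw",
    "optim_args": "rank=128,update_proj_gap=200,scale=0.25",  # string form (see patch 6)
    "optim_target_modules": "all_linear",  # shorthand for every linear layer
}
```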
From dc45927cda2ed6b7b53ae3baf2d3accd306be1e7 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 16 Mar 2024 00:12:29 -0400
Subject: [PATCH 6/8] fix optim_args

---
 src/axolotl/core/trainer_builder.py                  | 10 ++++++++--
 .../utils/config/models/input/v0_4_1/__init__.py     |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index d74665b18a..0980e4358b 100644
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -216,7 +216,7 @@ def __init__(
         num_epochs=1,
         bench_data_collator=None,
         eval_data_collator=None,
-        **kwargs
+        **kwargs,
     ):
         self.num_epochs = num_epochs
         self.bench_data_collator = bench_data_collator
@@ -1018,7 +1018,13 @@ def build(self, total_num_steps):
             self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
         )
         if self.cfg.optim_args:
-            training_arguments_kwargs["optim_args"] = self.cfg.optim_args
+            if isinstance(self.cfg.optim_args, dict):
+                optim_args = ",".join(
+                    [f"{key}={value}" for key, value in self.cfg.optim_args.items()]
+                )
+            else:
+                optim_args = self.cfg.optim_args
+            training_arguments_kwargs["optim_args"] = optim_args
         if self.cfg.optim_target_modules:
             training_arguments_kwargs[
                 "optim_target_modules"
diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index 4619ace548..a8d3382b31 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -310,7 +310,7 @@ class HyperparametersConfig(BaseModel):
     learning_rate: Union[str, float]
     weight_decay: Optional[float] = None
     optimizer: Optional[Union[OptimizerNames, Literal["lion_pytorch"]]] = None
-    optim_args: Optional[Dict[str, Any]] = Field(
+    optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
         default=None, metadata={"help": "Optional arguments to supply to optimizer."}
     )
     optim_target_modules: Optional[Union[List[str], Literal["all_linear"]]] = Field(

From 9437b48e181c12040170dcfaf56a93aceb4b81a1 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 19 Mar 2024 08:43:23 -0400
Subject: [PATCH 7/8] update dependencies for galore

---
 requirements.txt | 2 +-
 setup.py         | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 6f6ec310fc..aaa27c547b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
 peft==0.9.0
-transformers==4.38.2
+transformers @ git+https://github.com/huggingface/transformers.git@f6261d7d81edd036fc53bfede65fe91f01a661aa
 tokenizers==0.15.0
 bitsandbytes>=0.43.0
 accelerate==0.26.1
diff --git a/setup.py b/setup.py
index cbed45afe8..307691bd49 100644
--- a/setup.py
+++ b/setup.py
@@ -89,6 +89,8 @@ def parse_requirements():
         "lion-pytorch": [
             "lion-pytorch==0.1.2",
         ],
-        "galore": ["galore_torch @ git+https://github.com/jiaweizzhao/GaLore"],
+        "galore": [
+            "galore_torch",
+        ],
     },
 )
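The dict-to-string conversion added in patch 6 is easy to sanity-check in isolation; a standalone sketch of the same join logic, with illustrative config values:

```python
optim_args_cfg = {"rank": 128, "update_proj_gap": 200, "scale": 0.25, "proj_type": "std"}

# Mirrors the conversion in trainer_builder.py: TrainingArguments.optim_args
# expects a comma-separated key=value string, so dict configs are flattened.
optim_args = ",".join(f"{key}={value}" for key, value in optim_args_cfg.items())
assert optim_args == "rank=128,update_proj_gap=200,scale=0.25,proj_type=std"
```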
From 3ce3b2b0949972f8ae0c9d4173f37a0b0bb9f1b9 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 19 Mar 2024 08:52:51 -0400
Subject: [PATCH 8/8] add galore to cicd dockerfile

---
 cicd/Dockerfile.jinja | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja
index a4784707c5..19d7a09de5 100644
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -23,9 +23,9 @@ RUN git fetch origin +$GITHUB_REF && \

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
     fi

 # So we can test the Docker image
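For background on what the `galore` extra installs: galore_torch provides drop-in optimizer classes that read GaLore hyperparameters from each parameter group. A sketch adapted from the upstream GaLore README; the toy model and the dimension-based parameter split are assumptions for illustration only:

```python
import torch
from galore_torch import GaLoreAdamW  # installed by the `galore` extra

# Hypothetical two-layer model purely for illustration.
model = torch.nn.Sequential(
    torch.nn.Linear(512, 512), torch.nn.ReLU(), torch.nn.Linear(512, 512)
)

# GaLore hyperparameters attach per parameter group; these mirror the
# rank/update_proj_gap/scale/proj_type optim_args documented in the README above.
galore_params = [p for p in model.parameters() if p.dim() == 2]
other_params = [p for p in model.parameters() if p.dim() != 2]
param_groups = [
    {"params": other_params},
    {"params": galore_params, "rank": 128, "update_proj_gap": 200, "scale": 0.25, "proj_type": "std"},
]
optimizer = GaLoreAdamW(param_groups, lr=2e-5)
```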