diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml
index 30916ed45a..274bd7312d 100644
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -65,6 +65,7 @@ deepspeed:
 weight_decay: 0.0
 fsdp:
   - full_shard
+  - auto_wrap
 fsdp_config:
   fsdp_limit_all_gathers: true
   fsdp_sync_module_states: true
@@ -73,4 +74,5 @@ fsdp_config:
   fsdp_cpu_ram_efficient_loading: true
   fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
   fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
 special_tokens:
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 2de2c54cce..6625080755 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -198,7 +198,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
             .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
             .values
         )
-        LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True)
+        LOG.debug(f"total_num_tokens: {total_num_tokens:_}", main_process_only=True)
         if update:
             cfg.total_num_tokens = total_num_tokens
@@ -212,7 +212,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
             .sum()
         )
         LOG.debug(
-            f"`total_supervised_tokens: {total_supervised_tokens}`",
+            f"`total_supervised_tokens: {total_supervised_tokens:_}`",
            main_process_only=True,
         )
         if update:
@@ -239,7 +239,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                 * cfg.num_epochs
             )
             LOG.debug(
-                f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}",
+                f"total_num_tokens: {cfg.total_num_tokens:_}, total_num_steps: {total_num_steps:_}",
                 main_process_only=True,
             )
         else:
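Note on the changes above: the YAML hunk pairs "- auto_wrap" in the fsdp list with fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP, so FSDP wraps each module of the class named by the existing fsdp_transformer_layer_cls_to_wrap key (LlamaDecoderLayer) as its own shard unit. The trainer.py hunks only change log formatting: ":_" is Python's underscore digit-grouping format spec (PEP 515), which makes large token counts readable. A minimal sketch with a made-up token count (not taken from the diff):

    # ":_" groups integer digits with underscores in f-strings (Python 3.6+).
    total_num_tokens = 123456789  # hypothetical value for illustration
    print(f"total_num_tokens: {total_num_tokens}")    # total_num_tokens: 123456789
    print(f"total_num_tokens: {total_num_tokens:_}")  # total_num_tokens: 123_456_789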