From 020a25fb86c1ec6a0ebce2398e91e4f7cc98d2fe Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Fri, 29 Nov 2024 11:00:12 +0800 Subject: [PATCH] Fix ckpt convert bug (#9521) * refine log * refine * refine * refine --- paddlenlp/trainer/utils/ckpt_converter.py | 4 ++-- scripts/distribute/ci_case_auto.sh | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/paddlenlp/trainer/utils/ckpt_converter.py b/paddlenlp/trainer/utils/ckpt_converter.py index dc1481f1f471..36cabd468f52 100644 --- a/paddlenlp/trainer/utils/ckpt_converter.py +++ b/paddlenlp/trainer/utils/ckpt_converter.py @@ -270,7 +270,7 @@ def gen_metadata_and_prepare_source_state_dict(self): malloc_size = 0 for opt_state_name, opt_state_value in optimizer_state_dict.items(): malloc_size += opt_state_value.numel() * opt_state_value.element_size() - malloc_size = malloc_size.numpy() / 2**20 + malloc_size = malloc_size / 2**20 logger.debug(f"{malloc_size} MB of GPU memory were allocated.") # merge sharding @@ -555,7 +555,7 @@ def load_state_dict_and_rename(self): for k, v in state_dict.items(): memory_size += v.numel() * v.element_size() - memory_size = memory_size.numpy() / 2**20 + memory_size = memory_size / 2**20 logger.debug( f"The current rank has finished loading the checkpoint file and has allocated {memory_size} MB of GPU memory." ) diff --git a/scripts/distribute/ci_case_auto.sh b/scripts/distribute/ci_case_auto.sh index 7c11fbad457e..f09cdc174b57 100755 --- a/scripts/distribute/ci_case_auto.sh +++ b/scripts/distribute/ci_case_auto.sh @@ -95,7 +95,6 @@ function llama_case_list_auto() { llama_dy2st_auto_bs4_bf16_DP1-MP1-PP4-SD2 llama_align_dygraph_dy2st_auto_bs2_bf16_DP2-MP1-PP1 llama_pir_auto_fuse_ffn_attention_qkv_MP2 - llama_convert_hybrid_ckpt_to_auto_parallel_bs2_fp32_DP2-MP1-PP1 llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP1-SP llama_align_dygraph_dy2st_pir_auto_bs2_bf16_DP2-MP2-PP2-SP llama_align_dygraph_dy2st_pir_auto_grad_merge_bs2_fp32_DP1-MP1-PP1