diff --git a/configs/13B_template.py b/configs/13B_template.py
index 26be3f71..8a4c019d 100644
--- a/configs/13B_template.py
+++ b/configs/13B_template.py
@@ -1,8 +1,7 @@
-
 DO_ALERT = False
 
-SEQ_LEN = {seq_len}
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint})
+SEQ_LEN = 4096
+JOB_NAME = "13b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({checkpoint})
 HIDDEN_SIZE = 5120
 NUM_ATTENTION_HEAD = 40
 MLP_RATIO = 8 / 3
@@ -50,9 +49,9 @@
 data = dict(
     seq_len=SEQ_LEN,
     # micro_num means the number of micro_batch contained in one gradient update
-    micro_num=4,
+    micro_num=1,
     # packed_length = micro_bsz * SEQ_LEN
-    micro_bsz=2,
+    micro_bsz={micro_bsz},
     # defaults to the value of micro_num
     valid_micro_num=4,
     # defaults to 0, means disable evaluate
@@ -126,7 +125,7 @@
 )
 
 model = dict(
-    checkpoint={checkpoint},  # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1]
+    checkpoint={checkpoint},
     num_attention_heads=NUM_ATTENTION_HEAD,
     embed_split_hidden=True,
     vocab_size=VOCAB_SIZE,
diff --git a/configs/13B_train/131072_flash-attn_ckpt_False.py b/configs/13B_train/131072_flash-attn_ckpt_False.py
deleted file mode 100644
index 28d51af6..00000000
--- a/configs/13B_train/131072_flash-attn_ckpt_False.py
+++ /dev/null
@@ -1,180 +0,0 @@
-
-DO_ALERT = False
-
-SEQ_LEN = 131072
-JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False)
-HIDDEN_SIZE = 5120
-NUM_ATTENTION_HEAD = 40
-MLP_RATIO = 8 / 3
-NUM_LAYER = 40
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
-    load_ckpt_folder="local:llm_ckpts/",
-    # 'load_ckpt_info' setting guide:
-    # 1. the 'path' indicate ckpt path,
-    # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
-    # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported.
-    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
-    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
-    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
-    # with an automatic restart mechanism upon training reboot.
-    # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
-    # path specified in `load_ckpt_info` by default.
-    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
-    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
-    auto_resume=True,
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporarily files during asynchronous upload.
- oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. 
size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash-attn_ckpt_True.py b/configs/13B_train/131072_flash-attn_ckpt_True.py deleted file mode 100644 index 6d1b7ef0..00000000 --- a/configs/13B_train/131072_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash_ckpt_False.py b/configs/13B_train/131072_flash_ckpt_False.py deleted file mode 100644 index dd0f0e89..00000000 --- a/configs/13B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash_ckpt_True.py b/configs/13B_train/131072_flash_ckpt_True.py deleted file mode 100644 index 2b9276db..00000000 --- a/configs/13B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_intern_ckpt_False.py b/configs/13B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 182e4ddb..00000000 --- a/configs/13B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_intern_ckpt_True.py b/configs/13B_train/131072_intern_ckpt_True.py deleted file mode 100644 index c23a3c10..00000000 --- a/configs/13B_train/131072_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_megatron_ckpt_False.py b/configs/13B_train/131072_megatron_ckpt_False.py deleted file mode 100644 index 935ff98d..00000000 --- a/configs/13B_train/131072_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_megatron_ckpt_True.py b/configs/13B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index 441166c2..00000000 --- a/configs/13B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_none_ckpt_False.py b/configs/13B_train/131072_none_ckpt_False.py deleted file mode 100644 index e43d6044..00000000 --- a/configs/13B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_none_ckpt_True.py b/configs/13B_train/131072_none_ckpt_True.py deleted file mode 100644 index 0945dbdc..00000000 --- a/configs/13B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash-attn_ckpt_False.py b/configs/13B_train/16384_flash-attn_ckpt_False.py deleted file mode 100644 index 393e54d3..00000000 --- a/configs/13B_train/16384_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash-attn_ckpt_True.py b/configs/13B_train/16384_flash-attn_ckpt_True.py deleted file mode 100644 index 7f7e7ac6..00000000 --- a/configs/13B_train/16384_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash_ckpt_False.py b/configs/13B_train/16384_flash_ckpt_False.py deleted file mode 100644 index cadd215f..00000000 --- a/configs/13B_train/16384_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash_ckpt_True.py b/configs/13B_train/16384_flash_ckpt_True.py deleted file mode 100644 index c60ea730..00000000 --- a/configs/13B_train/16384_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_intern_ckpt_False.py b/configs/13B_train/16384_intern_ckpt_False.py deleted file mode 100644 index e5d6fa6b..00000000 --- a/configs/13B_train/16384_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_intern_ckpt_True.py b/configs/13B_train/16384_intern_ckpt_True.py deleted file mode 100644 index 6ac47ac2..00000000 --- a/configs/13B_train/16384_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_megatron_ckpt_False.py b/configs/13B_train/16384_megatron_ckpt_False.py deleted file mode 100644 index 24429ead..00000000 --- a/configs/13B_train/16384_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_megatron_ckpt_True.py b/configs/13B_train/16384_megatron_ckpt_True.py deleted file mode 100644 index d79c8207..00000000 --- a/configs/13B_train/16384_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
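# --- Editor's note (not part of the original file) ---------------------------
# Worked example of the packed-length arithmetic stated in the data dict below
# ("packed_length = micro_bsz * SEQ_LEN"), using this file's own values.
SEQ_LEN = 16384
micro_bsz = 2
micro_num = 4
packed_length = micro_bsz * SEQ_LEN                       # 32768 tokens per micro-batch
tokens_per_step_per_dp_rank = micro_num * packed_length   # 131072 tokens per optimizer step
assert (packed_length, tokens_per_step_per_dp_rank) == (32768, 131072)
# ------------------------------------------------------------------------------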
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_none_ckpt_False.py b/configs/13B_train/16384_none_ckpt_False.py deleted file mode 100644 index a30d713a..00000000 --- a/configs/13B_train/16384_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
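# --- Editor's note (not part of the original file) ---------------------------
# Summary sketch of the sequence-parallel ("sp") variants that appear across the
# generated configs deleted in this diff; only the "intern" mode is paired with
# intern_overlap=True. The mapping is read off the parallel dicts below, not
# taken from any other source.
sp_variants = {
    "none": dict(intern_overlap=False),
    "megatron": dict(intern_overlap=False),
    "flash-attn": dict(intern_overlap=False),
    "flash": dict(intern_overlap=False),
    "intern": dict(intern_overlap=True),
}
assert sum(v["intern_overlap"] for v in sp_variants.values()) == 1
# ------------------------------------------------------------------------------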
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_none_ckpt_True.py b/configs/13B_train/16384_none_ckpt_True.py deleted file mode 100644 index 76483257..00000000 --- a/configs/13B_train/16384_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
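# --- Editor's note (not part of the original file) ---------------------------
# The deleted files all follow the naming pattern
# configs/13B_train/<seq_len>_<sp>_ckpt_<checkpoint>.py, which suggests they were
# generated from a shared template. A hypothetical generator is sketched below;
# the function name, the placeholder names, and TEMPLATE are assumptions, not
# code from this repository.
import itertools

TEMPLATE = 'SEQ_LEN = {seq_len}\nSP = "{sp}"\nCHECKPOINT = {checkpoint}\n'

def render_configs(seq_lens, sp_modes, ckpt_flags):
    for seq_len, sp, checkpoint in itertools.product(seq_lens, sp_modes, ckpt_flags):
        name = f"configs/13B_train/{seq_len}_{sp}_ckpt_{checkpoint}.py"
        yield name, TEMPLATE.format(seq_len=seq_len, sp=sp, checkpoint=checkpoint)

for path, _body in render_configs([16384, 262144], ["none", "megatron", "intern"], [False, True]):
    print(path)
# ------------------------------------------------------------------------------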
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_flash-attn_ckpt_False.py b/configs/13B_train/262144_flash-attn_ckpt_False.py deleted file mode 100644 index fd0be6a7..00000000 --- a/configs/13B_train/262144_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
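# --- Editor's note (not part of the original file) ---------------------------
# Illustration of the model["checkpoint"] field documented below: it accepts
# True/False or a fraction in [0, 1] giving the proportion of layers that use
# activation checkpointing. The 0.5 value and the rounding are illustrative
# assumptions, not settings used anywhere in this diff.
NUM_LAYER = 40
checkpoint = 0.5                           # hypothetical fractional setting
ckpt_layers = int(NUM_LAYER * checkpoint)  # roughly 20 of 40 layers recomputed
assert ckpt_layers == 20
# ------------------------------------------------------------------------------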
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_flash_ckpt_False.py b/configs/13B_train/262144_flash_ckpt_False.py deleted file mode 100644 index 5ca332ef..00000000 --- a/configs/13B_train/262144_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
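# --- Editor's note (not part of the original file) ---------------------------
# Back-of-the-envelope activation size for this configuration (SEQ_LEN = 262144,
# HIDDEN_SIZE = 5120, bfloat16 = 2 bytes): the hidden states of a single layer
# for a single packed sequence alone occupy 2.5 GiB, before any parallelism or
# checkpointing is applied. Only the hidden-state tensor is counted here.
SEQ_LEN, HIDDEN_SIZE, BYTES_PER_BF16 = 262144, 5120, 2
hidden_state_bytes = SEQ_LEN * HIDDEN_SIZE * BYTES_PER_BF16
assert hidden_state_bytes / 2**30 == 2.5  # GiB per layer per sequence
# ------------------------------------------------------------------------------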
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_flash_ckpt_True.py b/configs/13B_train/262144_flash_ckpt_True.py deleted file mode 100644 index f990655a..00000000 --- a/configs/13B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
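# --- Editor's note (not part of the original file) ---------------------------
# Dynamic loss-scaling arithmetic implied by the grad_scaler dict below: the
# scale starts at 2**16, doubles after every 1000 overflow-free steps, and is
# capped at 2**24, so at least 8 * 1000 = 8000 overflow-free steps are needed to
# reach the cap; after hysteresis=2 overflows it is halved again.
initial_scale, max_scale, growth_interval, growth_factor = 2**16, 2**24, 1000, 2
doublings_to_cap = 0
scale = initial_scale
while scale < max_scale:
    scale *= growth_factor
    doublings_to_cap += 1
assert doublings_to_cap == 8 and doublings_to_cap * growth_interval == 8000
# ------------------------------------------------------------------------------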
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_intern_ckpt_False.py b/configs/13B_train/262144_intern_ckpt_False.py deleted file mode 100644 index 7ebcf94f..00000000 --- a/configs/13B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
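# --- Editor's note (not part of the original file) ---------------------------
# Sketch of how zero1=dict(size=-1) interacts with the parallel dict below,
# following the docstring: with size <= 0 the ZeRO group equals the data-parallel
# group. The world_size of 32 GPUs is a purely hypothetical example.
world_size = 32                      # assumption for illustration
tensor_parallel, pipeline_parallel = 8, 1
data_parallel = world_size // (tensor_parallel * pipeline_parallel)
zero1_shards = data_parallel         # zero1 size -1 -> shard across the dp group
assert (data_parallel, zero1_shards) == (4, 4)
# ------------------------------------------------------------------------------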
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_intern_ckpt_True.py b/configs/13B_train/262144_intern_ckpt_True.py deleted file mode 100644 index e958ac06..00000000 --- a/configs/13B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
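# --- Editor's note (not part of the original file) ---------------------------
# The reduce bucket used for gradient communication is configured below as
# 512 * 1024 * 1024 = 536,870,912; whether that figure counts elements or bytes
# depends on the optimizer implementation, which this diff does not show, so the
# unit is left unspecified here.
reduce_bucket_size = 512 * 1024 * 1024
assert reduce_bucket_size == 536_870_912
# ------------------------------------------------------------------------------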
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_megatron_ckpt_False.py b/configs/13B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index 31e96f78..00000000 --- a/configs/13B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
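# --- Editor's note (not part of the original file) ---------------------------
# Rough per-rank shapes under the tensor=dict(size=8) setting below, assuming a
# conventional tensor-parallel split of attention heads and hidden columns
# (the split rule itself is an assumption; only the sizes come from this file).
NUM_ATTENTION_HEAD, HIDDEN_SIZE, TP = 40, 5120, 8
heads_per_tp_rank = NUM_ATTENTION_HEAD // TP   # 5 heads per tensor-parallel rank
hidden_per_tp_rank = HIDDEN_SIZE // TP         # 640 hidden columns per rank
assert (heads_per_tp_rank, hidden_per_tp_rank) == (5, 640)
# ------------------------------------------------------------------------------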
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_megatron_ckpt_True.py b/configs/13B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 2339244b..00000000 --- a/configs/13B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_none_ckpt_False.py b/configs/13B_train/262144_none_ckpt_False.py deleted file mode 100644 index 41d55e91..00000000 --- a/configs/13B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
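# A minimal sketch of the zero1 sizing rules spelled out in the docstring above,
# assuming (beyond what the docstring states) that a subset size must divide the
# data-parallel world size evenly. Hypothetical helper, not code from this repo.
def resolve_zero1_size(size, dp_world_size):
    if size <= 0:
        return dp_world_size  # shard optimizer states across the whole dp group
    if size == 1:
        return 1              # ZeRO disabled: every dp rank keeps full parameters
    if size > dp_world_size or dp_world_size % size != 0:
        raise ValueError("zero1 size must be <= dp world size (and divide it)")
    return size               # shard within a subset, e.g. <= 8 ranks per node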
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_none_ckpt_True.py b/configs/13B_train/262144_none_ckpt_True.py deleted file mode 100644 index 4f2da605..00000000 --- a/configs/13B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
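# Illustrative arithmetic for the data block that follows: each micro-batch packs
# micro_bsz sequences of seq_len tokens (packed_length = micro_bsz * SEQ_LEN), and
# micro_num micro-batches are accumulated per gradient update. Hypothetical helper.
def tokens_per_optimizer_step(seq_len, micro_bsz, micro_num):
    packed_length = micro_bsz * seq_len      # as noted in the config comment
    return packed_length * micro_num         # tokens per update, per data-parallel rank
# With seq_len=262144, micro_bsz=2, micro_num=4 this is 2,097,152 tokens per rank.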
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash-attn_ckpt_False.py b/configs/13B_train/32768_flash-attn_ckpt_False.py deleted file mode 100644 index 3eb0f493..00000000 --- a/configs/13B_train/32768_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash-attn_ckpt_True.py b/configs/13B_train/32768_flash-attn_ckpt_True.py deleted file mode 100644 index 26b06ef3..00000000 --- a/configs/13B_train/32768_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
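# A generic sketch of the dynamic loss-scaling behaviour that the grad_scaler
# fields describe (initial_scale, growth_interval, growth_factor, backoff_factor,
# hysteresis, min/max scale). Hypothetical code, not the trainer's implementation.
class DynamicLossScaleSketch:
    def __init__(self, initial_scale=2**16, growth_factor=2.0, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
        self.scale = initial_scale
        self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
        self.growth_interval, self.hysteresis = growth_interval, hysteresis
        self.min_scale, self.max_scale = min_scale, max_scale
        self._good_steps = 0
        self._overflows = 0

    def update(self, found_overflow):
        if found_overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:   # tolerate a few overflows first
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows = 0
        else:
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:   # long clean run: grow
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0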
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash_ckpt_False.py b/configs/13B_train/32768_flash_ckpt_False.py deleted file mode 100644 index da30a4dd..00000000 --- a/configs/13B_train/32768_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash_ckpt_True.py b/configs/13B_train/32768_flash_ckpt_True.py deleted file mode 100644 index 20d415a5..00000000 --- a/configs/13B_train/32768_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
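# Hypothetical sketch of how the lr_scheduler fields (total_steps, warmup_ratio,
# eta_min) could translate into a per-step learning rate. The decay shape below
# (cosine after a linear warmup) is an assumption; the config does not name the
# scheduler class.
import math

def lr_at_step(step, base_lr=1e-4, total_steps=20, warmup_ratio=0.01, eta_min=1e-5):
    warmup_steps = max(1, int(total_steps * warmup_ratio))
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps           # linear warmup
    t = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))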
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_intern_ckpt_False.py b/configs/13B_train/32768_intern_ckpt_False.py deleted file mode 100644 index 05ab5285..00000000 --- a/configs/13B_train/32768_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_intern_ckpt_True.py b/configs/13B_train/32768_intern_ckpt_True.py deleted file mode 100644 index 273a812d..00000000 --- a/configs/13B_train/32768_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
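# Illustrative reading of model["checkpoint"]: the comment in the model block
# allows True/False or a proportion in [0, 1] of layers to run with activation
# checkpointing. A hypothetical helper resolving that into a layer count:
def num_checkpointed_layers(checkpoint, num_layers=40):
    if checkpoint is True:
        return num_layers
    if not checkpoint:                            # False, None or 0.0 -> no recomputation
        return 0
    return int(num_layers * float(checkpoint))    # e.g. 0.5 -> 20 of the 40 layers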
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_megatron_ckpt_False.py b/configs/13B_train/32768_megatron_ckpt_False.py deleted file mode 100644 index c8db542d..00000000 --- a/configs/13B_train/32768_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_megatron_ckpt_True.py b/configs/13B_train/32768_megatron_ckpt_True.py deleted file mode 100644 index 9ff56012..00000000 --- a/configs/13B_train/32768_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_none_ckpt_False.py b/configs/13B_train/32768_none_ckpt_False.py deleted file mode 100644 index a02e0711..00000000 --- a/configs/13B_train/32768_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_none_ckpt_True.py b/configs/13B_train/32768_none_ckpt_True.py deleted file mode 100644 index b9b17e3c..00000000 --- a/configs/13B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash-attn_ckpt_False.py b/configs/13B_train/4096_flash-attn_ckpt_False.py deleted file mode 100644 index 8e4459ea..00000000 --- a/configs/13B_train/4096_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash-attn_ckpt_True.py b/configs/13B_train/4096_flash-attn_ckpt_True.py deleted file mode 100644 index a8f5e39b..00000000 --- a/configs/13B_train/4096_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash_ckpt_False.py b/configs/13B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 517b46e4..00000000 --- a/configs/13B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash_ckpt_True.py b/configs/13B_train/4096_flash_ckpt_True.py deleted file mode 100644 index eacfcdfd..00000000 --- a/configs/13B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_intern_ckpt_False.py b/configs/13B_train/4096_intern_ckpt_False.py deleted file mode 100644 index 5ecf2d66..00000000 --- a/configs/13B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_intern_ckpt_True.py b/configs/13B_train/4096_intern_ckpt_True.py deleted file mode 100644 index b70acb01..00000000 --- a/configs/13B_train/4096_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_megatron_ckpt_False.py b/configs/13B_train/4096_megatron_ckpt_False.py deleted file mode 100644 index 2e847a64..00000000 --- a/configs/13B_train/4096_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_megatron_ckpt_True.py b/configs/13B_train/4096_megatron_ckpt_True.py deleted file mode 100644 index d8ba2c57..00000000 --- a/configs/13B_train/4096_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_none_ckpt_False.py b/configs/13B_train/4096_none_ckpt_False.py deleted file mode 100644 index f8bbdfc5..00000000 --- a/configs/13B_train/4096_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_none_ckpt_True.py b/configs/13B_train/4096_none_ckpt_True.py deleted file mode 100644 index d8f8ec7e..00000000 --- a/configs/13B_train/4096_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash-attn_ckpt_False.py b/configs/13B_train/65536_flash-attn_ckpt_False.py deleted file mode 100644 index 09367f5a..00000000 --- a/configs/13B_train/65536_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash-attn_ckpt_True.py b/configs/13B_train/65536_flash-attn_ckpt_True.py deleted file mode 100644 index dc283a92..00000000 --- a/configs/13B_train/65536_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash_ckpt_False.py b/configs/13B_train/65536_flash_ckpt_False.py deleted file mode 100644 index 482d5114..00000000 --- a/configs/13B_train/65536_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash_ckpt_True.py b/configs/13B_train/65536_flash_ckpt_True.py deleted file mode 100644 index 66051f83..00000000 --- a/configs/13B_train/65536_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_intern_ckpt_False.py b/configs/13B_train/65536_intern_ckpt_False.py deleted file mode 100644 index f829652a..00000000 --- a/configs/13B_train/65536_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
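    # Sizing sketch for the data section below, using this file's values
    # (SEQ_LEN=65536, micro_bsz=2, micro_num=4); nothing here changes the config:
    #   packed_length = micro_bsz * SEQ_LEN = 2 * 65536 = 131072 tokens per micro batch
    #   tokens per optimizer step per data-parallel rank
    #       = micro_num * packed_length = 4 * 131072 = 524288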
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_intern_ckpt_True.py b/configs/13B_train/65536_intern_ckpt_True.py deleted file mode 100644 index 4e94d0e3..00000000 --- a/configs/13B_train/65536_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
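    # Reading the `checkpoint` field in the model dict further down: per its own
    # comment it accepts True/False or a fraction in [0, 1]. With NUM_LAYER = 40
    # that presumably means:
    #   checkpoint=True   -> recompute activations for all 40 layers (this file)
    #   checkpoint=0.5    -> roughly 20 of the 40 layers
    #   checkpoint=False  -> no activation checkpointing
    # (Illustrative reading only; the exact rounding is not specified here.)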
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_megatron_ckpt_False.py b/configs/13B_train/65536_megatron_ckpt_False.py deleted file mode 100644 index a9293334..00000000 --- a/configs/13B_train/65536_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
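    # For orientation: across these generated configs the tensor-parallel `sp`
    # field takes the values "none", "megatron", "flash"/"flash-attn" and
    # "intern", and intern_overlap=True appears only together with sp="intern"
    # (see the parallel dict near the end of each file). This file uses
    # sp="megatron" with intern_overlap=False.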
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_megatron_ckpt_True.py b/configs/13B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index 845e32bc..00000000 --- a/configs/13B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
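    # The grad_scaler block below encodes the usual dynamic loss-scaling loop;
    # roughly (a sketch of the behaviour these knobs describe, not new code):
    #   scale starts at 2**16;
    #   after 1000 consecutive overflow-free steps: scale = min(scale * 2, 2**24)
    #   after 2 overflows (hysteresis):             scale = max(scale * 0.5, 1)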
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_none_ckpt_False.py b/configs/13B_train/65536_none_ckpt_False.py deleted file mode 100644 index 52ce3c52..00000000 --- a/configs/13B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
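    # How zero1.size resolves, following the notes quoted near the end of this
    # file (sketch only):
    #   zero1 group = dp group            if size <= 0   # this file: size=-1
    #   zero1 disabled (full replicas)    if size == 1
    #   zero1 group of `size` ranks       if 1 < size <= dp world size
    # so size=-1 splits the ZeRO state across the whole data-parallel group.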
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_none_ckpt_True.py b/configs/13B_train/65536_none_ckpt_True.py deleted file mode 100644 index de5532e1..00000000 --- a/configs/13B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
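    # Warmup sketch for the lr_scheduler below, using this file's values:
    # warmup_ratio=0.01 of total_steps=20 is presumably floored to
    #   int(0.01 * 20) = 0
    # warmup steps, so these short benchmark runs effectively start at lr=1e-4
    # and decay towards eta_min=1e-5. (The exact rounding is an assumption here.)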
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash-attn_ckpt_False.py b/configs/13B_train/8192_flash-attn_ckpt_False.py deleted file mode 100644 index 3324c290..00000000 --- a/configs/13B_train/8192_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
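    # Rough parameter count implied by the constants above, assuming a gated MLP
    # (hence MLP_RATIO = 8/3, ~8*h^2 per layer) and untied embeddings; this is a
    # back-of-envelope sketch, not something computed by the config:
    #   per layer: attn ~4*5120^2 + mlp ~8*5120^2 ≈ 3.15e8
    #   40 layers: ≈ 1.26e10, plus ~0.5e9 for the 103168 x 5120 embedding
    #   => roughly 13B, matching the "13b_train" job name.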
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash-attn_ckpt_True.py b/configs/13B_train/8192_flash-attn_ckpt_True.py deleted file mode 100644 index 317e0f32..00000000 --- a/configs/13B_train/8192_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
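    # The adam dict below is roughly the standard AdamW setup, e.g. in PyTorch
    # terms (an equivalence sketch, not how the trainer necessarily builds it):
    #   torch.optim.AdamW(params, lr=1e-4, betas=(0.9, 0.95), eps=1e-8,
    #                     weight_decay=0.01)
    # adam_beta2_c=0 feeds c=0 into beta2_scheduler, which presumably keeps
    # beta2 fixed at 0.95 for the whole run.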
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash_ckpt_False.py b/configs/13B_train/8192_flash_ckpt_False.py deleted file mode 100644 index d645dc1b..00000000 --- a/configs/13B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
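    # Note on the evaluation cadence below: valid_every=50 but total_steps=20,
    # so (assuming valid_every counts training steps) these 20-step benchmark
    # runs never reach a validation pass; valid_micro_num=4 would only matter
    # for longer runs with train_folder/valid_folder enabled.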
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash_ckpt_True.py b/configs/13B_train/8192_flash_ckpt_True.py deleted file mode 100644 index 425859c0..00000000 --- a/configs/13B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_intern_ckpt_False.py b/configs/13B_train/8192_intern_ckpt_False.py deleted file mode 100644 index 0b4fb8a2..00000000 --- a/configs/13B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_intern_ckpt_True.py b/configs/13B_train/8192_intern_ckpt_True.py deleted file mode 100644 index b42cb769..00000000 --- a/configs/13B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_megatron_ckpt_False.py b/configs/13B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index e2191937..00000000 --- a/configs/13B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_megatron_ckpt_True.py b/configs/13B_train/8192_megatron_ckpt_True.py deleted file mode 100644 index 5123c412..00000000 --- a/configs/13B_train/8192_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_none_ckpt_False.py b/configs/13B_train/8192_none_ckpt_False.py deleted file mode 100644 index c9d9c050..00000000 --- a/configs/13B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_none_ckpt_True.py b/configs/13B_train/8192_none_ckpt_True.py deleted file mode 100644 index 182ec21f..00000000 --- a/configs/13B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_template.py b/configs/30B_template.py index 7a32015e..29dc37dd 100644 --- a/configs/30B_template.py +++ b/configs/30B_template.py @@ -1,8 +1,7 @@ - DO_ALERT = False -SEQ_LEN = {seq_len} -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({checkpoint}) HIDDEN_SIZE = 6144 NUM_ATTENTION_HEAD = 48 MLP_RATIO = 8 / 3 @@ -50,9 +49,9 @@ data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, + micro_num=1, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, + micro_bsz={micro_bsz}, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -126,7 +125,7 @@ ) model = dict( - checkpoint={checkpoint}, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + checkpoint={checkpoint}, num_attention_heads=NUM_ATTENTION_HEAD, embed_split_hidden=True, vocab_size=VOCAB_SIZE, diff --git a/configs/30B_train/131072_flash_ckpt_False.py b/configs/30B_train/131072_flash_ckpt_False.py deleted file mode 100644 index 3af48f3e..00000000 --- a/configs/30B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. 
- # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. 
- * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_flash_ckpt_True.py b/configs/30B_train/131072_flash_ckpt_True.py deleted file mode 100644 index 4bd249bc..00000000 --- a/configs/30B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. 
- auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. 
size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_intern_ckpt_False.py b/configs/30B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 77b176d2..00000000 --- a/configs/30B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
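As a quick, illustrative sanity check on the numbers these configs repeat (the variable names below are ad hoc, and the per-step token count assumes each micro-batch is packed to packed_length, as the data comments suggest):

CHECKPOINT_EVERY = 50
oss_snapshot_freq = int(CHECKPOINT_EVERY / 2)            # == 25, i.e. a snapshot roughly every 25 steps
SEQ_LEN, micro_bsz, micro_num = 131072, 2, 4
packed_length = micro_bsz * SEQ_LEN                      # == 262144 tokens per packed micro-batch
tokens_per_step_per_dp_rank = micro_num * packed_length  # == 1048576 tokens per gradient update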
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_intern_ckpt_True.py b/configs/30B_train/131072_intern_ckpt_True.py deleted file mode 100644 index 38a1db3b..00000000 --- a/configs/30B_train/131072_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
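The grad_scaler block repeated throughout these configs describes a standard dynamic loss-scaling loop. The class below is a minimal hypothetical sketch of how those knobs typically interact; it is illustrative only and is not InternLM's actual implementation.

class DynamicLossScale:
    """Illustrative dynamic loss scaling driven by the grad_scaler knobs above."""

    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2, backoff_factor=0.5, hysteresis=2):
        self.scale = initial_scale
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_interval, self.growth_factor = growth_interval, growth_factor
        self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
        self._good_steps = 0   # consecutive steps without overflow
        self._overflows = 0    # overflows seen since the last backoff

    def update(self, overflow: bool) -> None:
        if overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:
                # decrease the scale after `hysteresis` overflows
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows = 0
        else:
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:
                # increase the scale after `growth_interval` overflow-free steps
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0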
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_megatron_ckpt_False.py b/configs/30B_train/131072_megatron_ckpt_False.py deleted file mode 100644 index 49879303..00000000 --- a/configs/30B_train/131072_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_megatron_ckpt_True.py b/configs/30B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index d911d381..00000000 --- a/configs/30B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_none_ckpt_False.py b/configs/30B_train/131072_none_ckpt_False.py deleted file mode 100644 index 78b3c9a8..00000000 --- a/configs/30B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_none_ckpt_True.py b/configs/30B_train/131072_none_ckpt_True.py deleted file mode 100644 index 941279e7..00000000 --- a/configs/30B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_flash_ckpt_False.py b/configs/30B_train/16384_flash_ckpt_False.py deleted file mode 100644 index 779a10bc..00000000 --- a/configs/30B_train/16384_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_flash_ckpt_True.py b/configs/30B_train/16384_flash_ckpt_True.py deleted file mode 100644 index 0498e2c4..00000000 --- a/configs/30B_train/16384_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_intern_ckpt_False.py b/configs/30B_train/16384_intern_ckpt_False.py deleted file mode 100644 index 309a33f0..00000000 --- a/configs/30B_train/16384_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_intern_ckpt_True.py b/configs/30B_train/16384_intern_ckpt_True.py deleted file mode 100644 index 23c977a5..00000000 --- a/configs/30B_train/16384_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_megatron_ckpt_False.py b/configs/30B_train/16384_megatron_ckpt_False.py deleted file mode 100644 index 8576aa76..00000000 --- a/configs/30B_train/16384_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_megatron_ckpt_True.py b/configs/30B_train/16384_megatron_ckpt_True.py deleted file mode 100644 index 460aba3b..00000000 --- a/configs/30B_train/16384_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_none_ckpt_False.py b/configs/30B_train/16384_none_ckpt_False.py deleted file mode 100644 index 4ca50666..00000000 --- a/configs/30B_train/16384_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_none_ckpt_True.py b/configs/30B_train/16384_none_ckpt_True.py deleted file mode 100644 index c7987e0d..00000000 --- a/configs/30B_train/16384_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_flash_ckpt_False.py b/configs/30B_train/262144_flash_ckpt_False.py deleted file mode 100644 index 10d71d9c..00000000 --- a/configs/30B_train/262144_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_flash_ckpt_True.py b/configs/30B_train/262144_flash_ckpt_True.py deleted file mode 100644 index a1990dbb..00000000 --- a/configs/30B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_intern_ckpt_False.py b/configs/30B_train/262144_intern_ckpt_False.py deleted file mode 100644 index f8ec6a2f..00000000 --- a/configs/30B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_intern_ckpt_True.py b/configs/30B_train/262144_intern_ckpt_True.py deleted file mode 100644 index c5afa46b..00000000 --- a/configs/30B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_megatron_ckpt_False.py b/configs/30B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index 412da179..00000000 --- a/configs/30B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_megatron_ckpt_True.py b/configs/30B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 79affb19..00000000 --- a/configs/30B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_none_ckpt_False.py b/configs/30B_train/262144_none_ckpt_False.py deleted file mode 100644 index e6fbe1eb..00000000 --- a/configs/30B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_none_ckpt_True.py b/configs/30B_train/262144_none_ckpt_True.py deleted file mode 100644 index d507c30b..00000000 --- a/configs/30B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_flash_ckpt_False.py b/configs/30B_train/32768_flash_ckpt_False.py deleted file mode 100644 index 6bac5b31..00000000 --- a/configs/30B_train/32768_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_flash_ckpt_True.py b/configs/30B_train/32768_flash_ckpt_True.py deleted file mode 100644 index f21c9983..00000000 --- a/configs/30B_train/32768_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_intern_ckpt_False.py b/configs/30B_train/32768_intern_ckpt_False.py deleted file mode 100644 index 79728d64..00000000 --- a/configs/30B_train/32768_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_intern_ckpt_True.py b/configs/30B_train/32768_intern_ckpt_True.py deleted file mode 100644 index 6dc24c30..00000000 --- a/configs/30B_train/32768_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_megatron_ckpt_False.py b/configs/30B_train/32768_megatron_ckpt_False.py deleted file mode 100644 index 37fd0986..00000000 --- a/configs/30B_train/32768_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_megatron_ckpt_True.py b/configs/30B_train/32768_megatron_ckpt_True.py deleted file mode 100644 index 986b27dd..00000000 --- a/configs/30B_train/32768_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_none_ckpt_False.py b/configs/30B_train/32768_none_ckpt_False.py deleted file mode 100644 index 9c6ca879..00000000 --- a/configs/30B_train/32768_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_none_ckpt_True.py b/configs/30B_train/32768_none_ckpt_True.py deleted file mode 100644 index d4ab7f2d..00000000 --- a/configs/30B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_flash_ckpt_False.py b/configs/30B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 3dd8be56..00000000 --- a/configs/30B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_flash_ckpt_True.py b/configs/30B_train/4096_flash_ckpt_True.py deleted file mode 100644 index 73150acf..00000000 --- a/configs/30B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_intern_ckpt_False.py b/configs/30B_train/4096_intern_ckpt_False.py deleted file mode 100644 index cff6c5b6..00000000 --- a/configs/30B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_intern_ckpt_True.py b/configs/30B_train/4096_intern_ckpt_True.py deleted file mode 100644 index 1fb64257..00000000 --- a/configs/30B_train/4096_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_megatron_ckpt_False.py b/configs/30B_train/4096_megatron_ckpt_False.py deleted file mode 100644 index 79f718d0..00000000 --- a/configs/30B_train/4096_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_megatron_ckpt_True.py b/configs/30B_train/4096_megatron_ckpt_True.py deleted file mode 100644 index 502ae7f7..00000000 --- a/configs/30B_train/4096_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
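
The zero1 sizing rules quoted in the docstring of these configs reduce to a small piece of resolution logic. The sketch below only mirrors the documented semantics; the helper name resolve_zero1_size and the divisibility check are illustrative assumptions, not the training framework's actual code.

# Minimal sketch of the zero1 size resolution rules described in the config docstring.
def resolve_zero1_size(configured_size: int, dp_world_size: int) -> int:
    """Return the effective ZeRO-1 process-group size (illustrative only)."""
    if configured_size <= 0:
        # shard optimizer states across the whole data-parallel group
        return dp_world_size
    if configured_size == 1:
        # ZeRO-1 disabled: every dp rank keeps the full parameter/optimizer state
        return 1
    if configured_size > dp_world_size:
        raise ValueError("zero1 size must not exceed the dp world size")
    if dp_world_size % configured_size != 0:
        # assumed constraint for illustration: groups must tile the dp group evenly
        raise ValueError("dp world size should be divisible by the zero1 size")
    return configured_size

# Example: with dp=16 and zero1=dict(size=-1), states are sharded 16 ways.
assert resolve_zero1_size(-1, 16) == 16
assert resolve_zero1_size(8, 16) == 8
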
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_none_ckpt_False.py b/configs/30B_train/4096_none_ckpt_False.py deleted file mode 100644 index 981a0f23..00000000 --- a/configs/30B_train/4096_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_none_ckpt_True.py b/configs/30B_train/4096_none_ckpt_True.py deleted file mode 100644 index dddea663..00000000 --- a/configs/30B_train/4096_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_flash_ckpt_False.py b/configs/30B_train/65536_flash_ckpt_False.py deleted file mode 100644 index babebd95..00000000 --- a/configs/30B_train/65536_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_flash_ckpt_True.py b/configs/30B_train/65536_flash_ckpt_True.py deleted file mode 100644 index 064250e7..00000000 --- a/configs/30B_train/65536_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
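
The grad_scaler block repeated in these configs describes a conventional fp16 dynamic loss-scaling policy. The sketch below shows one common interpretation of those fields (growth_interval, growth_factor, backoff_factor, hysteresis, min/max scale); the class name and bookkeeping are assumptions for illustration and not the trainer's implementation.

# Rough sketch of dynamic loss scaling driven by the grad_scaler fields above.
class DynamicLossScaler:
    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2, backoff_factor=0.5,
                 hysteresis=2):
        self.scale = float(initial_scale)
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.growth_interval = growth_interval  # good steps before the scale grows
        self.growth_factor = growth_factor      # multiplier applied on growth
        self.backoff_factor = backoff_factor    # multiplier applied after overflow
        self.hysteresis = hysteresis            # overflows tolerated before backoff
        self._good_steps = 0
        self._overflows_left = hysteresis

    def update(self, found_overflow: bool) -> float:
        """Update the scale after one optimizer step and return the new value."""
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            if self._good_steps % self.growth_interval == 0:
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
        return self.scale
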
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_intern_ckpt_False.py b/configs/30B_train/65536_intern_ckpt_False.py deleted file mode 100644 index 64165f44..00000000 --- a/configs/30B_train/65536_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_intern_ckpt_True.py b/configs/30B_train/65536_intern_ckpt_True.py deleted file mode 100644 index 78b66213..00000000 --- a/configs/30B_train/65536_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_megatron_ckpt_False.py b/configs/30B_train/65536_megatron_ckpt_False.py deleted file mode 100644 index e8c09548..00000000 --- a/configs/30B_train/65536_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_megatron_ckpt_True.py b/configs/30B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index d3b64c41..00000000 --- a/configs/30B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
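
The data comments in these configs pin down the per-rank batch arithmetic: packed_length = micro_bsz * SEQ_LEN, with micro_num micro-batches per gradient update. A quick check for the 65536-token variants is below; dp_world_size is a purely hypothetical example value, and the global figure assumes tokens per step scale linearly with data parallelism.

# Batch-size arithmetic implied by the data dict comments (illustrative values).
SEQ_LEN = 65536
micro_bsz = 2
micro_num = 4
dp_world_size = 4  # assumed for illustration only

packed_length = micro_bsz * SEQ_LEN                             # 131072 tokens per micro batch
tokens_per_rank_step = micro_num * packed_length                # 524288 tokens per rank per step
tokens_per_global_step = tokens_per_rank_step * dp_world_size   # 2097152 tokens with dp=4

print(packed_length, tokens_per_rank_step, tokens_per_global_step)
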
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_none_ckpt_False.py b/configs/30B_train/65536_none_ckpt_False.py deleted file mode 100644 index ee4c7fb5..00000000 --- a/configs/30B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_none_ckpt_True.py b/configs/30B_train/65536_none_ckpt_True.py deleted file mode 100644 index 2e84144c..00000000 --- a/configs/30B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_flash_ckpt_False.py b/configs/30B_train/8192_flash_ckpt_False.py deleted file mode 100644 index b9eb6e65..00000000 --- a/configs/30B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_flash_ckpt_True.py b/configs/30B_train/8192_flash_ckpt_True.py deleted file mode 100644 index c0dd5175..00000000 --- a/configs/30B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_intern_ckpt_False.py b/configs/30B_train/8192_intern_ckpt_False.py deleted file mode 100644 index d915b6b8..00000000 --- a/configs/30B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_intern_ckpt_True.py b/configs/30B_train/8192_intern_ckpt_True.py deleted file mode 100644 index a71693a1..00000000 --- a/configs/30B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_megatron_ckpt_False.py b/configs/30B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index dcacb9e5..00000000 --- a/configs/30B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_megatron_ckpt_True.py b/configs/30B_train/8192_megatron_ckpt_True.py deleted file mode 100644 index b6e4ba24..00000000 --- a/configs/30B_train/8192_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_none_ckpt_False.py b/configs/30B_train/8192_none_ckpt_False.py deleted file mode 100644 index ce790dfa..00000000 --- a/configs/30B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_none_ckpt_True.py b/configs/30B_train/8192_none_ckpt_True.py deleted file mode 100644 index e6afcd4e..00000000 --- a/configs/30B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
-) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. 
interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_template.py b/configs/7B_template.py index b9f76a51..f755520f 100644 --- a/configs/7B_template.py +++ b/configs/7B_template.py @@ -1,8 +1,8 @@ # JOB_NAME = "7b_train" DO_ALERT = False -SEQ_LEN = {seq_len} -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +SEQ_LEN = 4096 +JOB_NAME = "7b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({checkpoint}) HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 @@ -52,7 +52,7 @@ # micro_num means the number of micro_batch contained in one gradient update micro_num=1, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, + micro_bsz={micro_bsz}, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -126,7 +126,7 @@ ) model = dict( - checkpoint={checkpoint}, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] + checkpoint={checkpoint}, num_attention_heads=NUM_ATTENTION_HEAD, embed_split_hidden=True, vocab_size=VOCAB_SIZE, diff --git a/configs/7B_train/131072_flash-attn_ckpt_False.py b/configs/7B_train/131072_flash-attn_ckpt_False.py deleted file mode 100644 index 047fb372..00000000 --- a/configs/7B_train/131072_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
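These per-combination files are deleted in favour of the templates (configs/7B_template.py, configs/13B_template.py), whose {micro_bsz}, {sp} and {checkpoint} placeholders are filled in when a concrete config is generated. A minimal sketch of how such a config could be rendered is below; the render_config function, the output file naming, and the plain string substitution are assumptions for illustration, since the actual generation script is not shown in this diff.

from pathlib import Path

def render_config(template_path: str, out_dir: str, micro_bsz: int, sp: str, checkpoint) -> Path:
    """Fill the template placeholders and write a concrete training config (hypothetical)."""
    text = Path(template_path).read_text()
    rendered = (text
                .replace("{micro_bsz}", str(micro_bsz))
                .replace("{sp}", repr(sp))          # sp is substituted as a quoted string
                .replace("{checkpoint}", str(checkpoint)))
    out = Path(out_dir) / f"{micro_bsz}_{sp}_ckpt_{checkpoint}.py"
    out.write_text(rendered)
    return out

# e.g. render_config("configs/7B_template.py", "configs/7B_train", 1, "flash-attn", False)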
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_flash-attn_ckpt_True.py b/configs/7B_train/131072_flash-attn_ckpt_True.py deleted file mode 100644 index 763627d6..00000000 --- a/configs/7B_train/131072_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
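Each of these configs documents packed_length = micro_bsz * SEQ_LEN and uses micro_num micro-batches per gradient update. For rough throughput bookkeeping, a small helper follows; the function name and the dp_size argument are illustrative assumptions (dp_size is whatever data-parallel size the launch ends up with).

def tokens_per_optimizer_step(seq_len: int, micro_bsz: int, micro_num: int, dp_size: int) -> int:
    packed_length = micro_bsz * seq_len          # tokens in one packed micro-batch
    return packed_length * micro_num * dp_size   # summed over micro-batches and dp ranks

# e.g. a 131072-token config with micro_bsz=1, micro_num=1 on 8 dp ranks:
print(tokens_per_optimizer_step(131072, 1, 1, 8))  # 1048576 tokens per step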
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_flash_ckpt_False.py b/configs/7B_train/131072_flash_ckpt_False.py deleted file mode 100644 index 4307e9d1..00000000 --- a/configs/7B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_flash_ckpt_True.py b/configs/7B_train/131072_flash_ckpt_True.py deleted file mode 100644 index c110b256..00000000 --- a/configs/7B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
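The grad_scaler block in every config describes a standard dynamic loss-scaling policy: start at initial_scale, multiply by growth_factor after growth_interval overflow-free steps, back off by backoff_factor once hysteresis overflows have accumulated, and clamp to [min_scale, max_scale]. The following is a simplified sketch of that state machine under those assumptions; it is not the framework's implementation.

class SimpleLossScaler:
    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2, backoff_factor=0.5, hysteresis=2):
        self.scale = float(initial_scale)
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_interval, self.growth_factor = growth_interval, growth_factor
        self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
        self._good_steps = 0
        self._overflows_left = hysteresis

    def update(self, overflow: bool) -> float:
        if overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:
                # too many overflows: shrink the scale, never below min_scale
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:
                # a long overflow-free run: grow the scale, capped at max_scale
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0
                self._overflows_left = self.hysteresis
        return self.scale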
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_intern_ckpt_False.py b/configs/7B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 1d728be7..00000000 --- a/configs/7B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
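The tensor-parallel docstring in these 7B configs constrains sp to one of 'none', 'megatron', 'flash-attn' or 'intern', and intern_overlap is only meaningful for the 'intern' mode (the deleted configs set it to True only there). A small validation helper reflecting that constraint is sketched below; the helper is hypothetical and the framework's own checks may differ.

VALID_SP_MODES = ("none", "megatron", "flash-attn", "intern")

def check_tensor_parallel(cfg: dict) -> None:
    """Sanity-check a parallel['tensor'] dict such as dict(size=8, sp='intern', intern_overlap=True)."""
    sp = cfg.get("sp", "none")
    if sp not in VALID_SP_MODES:
        raise ValueError(f"sp must be one of {VALID_SP_MODES}, got {sp!r}")
    if cfg.get("intern_overlap", False) and sp != "intern":
        raise ValueError("intern_overlap=True only takes effect with sp='intern'")

check_tensor_parallel(dict(size=8, sp="intern", intern_overlap=True))     # ok
check_tensor_parallel(dict(size=8, sp="megatron", intern_overlap=False))  # ok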
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_intern_ckpt_True.py b/configs/7B_train/131072_intern_ckpt_True.py deleted file mode 100644 index 45d4aa01..00000000 --- a/configs/7B_train/131072_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
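The model.checkpoint field in these configs accepts True, False, or a fraction in [0, 1], interpreted as the proportion of layers whose activations are recomputed. A rough illustration of how such a value could map onto NUM_LAYER follows; the rounding rule is an assumption for illustration and may differ from the framework's exact behaviour.

def num_checkpointed_layers(checkpoint, num_layers: int = 32) -> int:
    """Map checkpoint=True/False/[0-1] onto a recomputed-layer count (illustrative)."""
    if checkpoint is True:
        return num_layers          # recompute activations for every layer
    if checkpoint is False:
        return 0                   # keep all activations, no recomputation
    if 0 <= checkpoint <= 1:
        return int(round(num_layers * checkpoint))
    raise ValueError("checkpoint must be True, False or a fraction in [0, 1]")

# e.g. for the 7B configs (NUM_LAYER = 32):
print(num_checkpointed_layers(True))   # 32
print(num_checkpointed_layers(0.5))    # 16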
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_megatron_ckpt_False.py b/configs/7B_train/131072_megatron_ckpt_False.py deleted file mode 100644 index 0bd98459..00000000 --- a/configs/7B_train/131072_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_megatron_ckpt_True.py b/configs/7B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index 9200afbe..00000000 --- a/configs/7B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_none_ckpt_False.py b/configs/7B_train/131072_none_ckpt_False.py deleted file mode 100644 index 16059fb1..00000000 --- a/configs/7B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_none_ckpt_True.py b/configs/7B_train/131072_none_ckpt_True.py deleted file mode 100644 index 35b3f08e..00000000 --- a/configs/7B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash-attn_ckpt_False.py b/configs/7B_train/16384_flash-attn_ckpt_False.py deleted file mode 100644 index 53a64b99..00000000 --- a/configs/7B_train/16384_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash-attn_ckpt_True.py b/configs/7B_train/16384_flash-attn_ckpt_True.py deleted file mode 100644 index cdb051e5..00000000 --- a/configs/7B_train/16384_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash_ckpt_False.py b/configs/7B_train/16384_flash_ckpt_False.py deleted file mode 100644 index 41b39515..00000000 --- a/configs/7B_train/16384_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash_ckpt_True.py b/configs/7B_train/16384_flash_ckpt_True.py deleted file mode 100644 index ca2c7f06..00000000 --- a/configs/7B_train/16384_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_intern_ckpt_False.py b/configs/7B_train/16384_intern_ckpt_False.py deleted file mode 100644 index 93abb682..00000000 --- a/configs/7B_train/16384_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_intern_ckpt_True.py b/configs/7B_train/16384_intern_ckpt_True.py deleted file mode 100644 index af9d9945..00000000 --- a/configs/7B_train/16384_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_megatron_ckpt_False.py b/configs/7B_train/16384_megatron_ckpt_False.py deleted file mode 100644 index d2c58d3a..00000000 --- a/configs/7B_train/16384_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_megatron_ckpt_True.py b/configs/7B_train/16384_megatron_ckpt_True.py deleted file mode 100644 index 6e372b8c..00000000 --- a/configs/7B_train/16384_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
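# [Editor's sketch - not InternLM code] A toy dynamic loss scaler showing how the fp16
# grad_scaler fields above are conventionally interpreted: grow the scale after
# `growth_interval` clean steps, back it off after `hysteresis` overflows, and clamp the
# result to [min_scale, max_scale]. Field names mirror the config; the class itself is
# hypothetical.
class ToyLossScaler:
    def __init__(self, initial_scale=2**16, growth_factor=2, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2, min_scale=1, max_scale=2**24):
        self.scale = initial_scale
        self.growth_factor, self.backoff_factor = growth_factor, backoff_factor
        self.growth_interval, self.hysteresis = growth_interval, hysteresis
        self.min_scale, self.max_scale = min_scale, max_scale
        self._good_steps = 0
        self._overflows = 0

    def update(self, overflow: bool) -> float:
        if overflow:
            self._good_steps = 0
            self._overflows += 1
            if self._overflows >= self.hysteresis:  # tolerate (hysteresis - 1) overflows before backing off
                self.scale = max(self.min_scale, self.scale * self.backoff_factor)
                self._overflows = 0
        else:
            self._overflows = 0
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:  # a clean run of steps lets the scale grow
                self.scale = min(self.max_scale, self.scale * self.growth_factor)
                self._good_steps = 0
        return self.scale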
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_none_ckpt_False.py b/configs/7B_train/16384_none_ckpt_False.py deleted file mode 100644 index 0fd65900..00000000 --- a/configs/7B_train/16384_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_none_ckpt_True.py b/configs/7B_train/16384_none_ckpt_True.py deleted file mode 100644 index 6ea5e1a9..00000000 --- a/configs/7B_train/16384_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
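# [Editor's note - illustrative arithmetic, not part of the original config] The data
# comments above define packed_length = micro_bsz * SEQ_LEN and micro_num as the number of
# micro-batches per gradient update, so the tokens consumed by one optimizer step can be
# estimated as below. dp_world_size is a hypothetical stand-in for the data-parallel size.
SEQ_LEN = 16384
micro_num = 1
micro_bsz = 1
dp_world_size = 8  # assumption for illustration only

packed_length = micro_bsz * SEQ_LEN                   # tokens in one packed micro-batch
tokens_per_step_per_rank = micro_num * packed_length  # tokens per gradient update on one dp rank
tokens_per_step_global = tokens_per_step_per_rank * dp_world_size
print(packed_length, tokens_per_step_per_rank, tokens_per_step_global)  # 16384 16384 131072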
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash-attn_ckpt_False.py b/configs/7B_train/262144_flash-attn_ckpt_False.py deleted file mode 100644 index 6dad9730..00000000 --- a/configs/7B_train/262144_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash-attn_ckpt_True.py b/configs/7B_train/262144_flash-attn_ckpt_True.py deleted file mode 100644 index cacd9737..00000000 --- a/configs/7B_train/262144_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash_ckpt_False.py b/configs/7B_train/262144_flash_ckpt_False.py deleted file mode 100644 index 0e9b0173..00000000 --- a/configs/7B_train/262144_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
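# [Editor's sketch - hypothetical helper, not part of InternLM] A small check that encodes
# the constraints stated in the parallel-settings docstring above: 'sp' must be one of the
# four documented sequence-parallel modes, and 'intern_overlap' only takes effect for the
# 'intern' mode.
VALID_SP_MODES = ("none", "megatron", "flash-attn", "intern")

def check_parallel_cfg(parallel):
    sp = parallel["tensor"].get("sp", "none")
    if sp not in VALID_SP_MODES:
        raise ValueError(f"unknown sequence parallel mode: {sp!r}")
    if parallel["tensor"].get("intern_overlap", False) and sp != "intern":
        raise ValueError("intern_overlap only takes effect with sp='intern'")
    return parallel

check_parallel_cfg(dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=8, sp="flash-attn", intern_overlap=False),
    pipeline=dict(size=1, interleaved_overlap=True),
))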
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash_ckpt_True.py b/configs/7B_train/262144_flash_ckpt_True.py deleted file mode 100644 index ddacc8df..00000000 --- a/configs/7B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_intern_ckpt_False.py b/configs/7B_train/262144_intern_ckpt_False.py deleted file mode 100644 index e5cf7694..00000000 --- a/configs/7B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_intern_ckpt_True.py b/configs/7B_train/262144_intern_ckpt_True.py deleted file mode 100644 index 76f9386a..00000000 --- a/configs/7B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_megatron_ckpt_False.py b/configs/7B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index b929f9a6..00000000 --- a/configs/7B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_megatron_ckpt_True.py b/configs/7B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 1655631c..00000000 --- a/configs/7B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
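The data dicts in these configs state that packed_length = micro_bsz * SEQ_LEN and that micro_num is the number of micro-batches per gradient update. A small worked example with the values used by the surrounding 262144-token configs; treating micro_num * packed_length as one rank's tokens per optimizer step is my assumption, not something stated in the configs.

# Worked example of the packing arithmetic stated in the data dict comments.
SEQ_LEN = 262144   # value used by the surrounding 7B configs
micro_bsz = 1
micro_num = 1      # micro-batches per gradient update

packed_length = micro_bsz * SEQ_LEN                   # 262144 tokens per packed micro-batch
tokens_per_step_per_rank = micro_num * packed_length  # assumed: tokens consumed per rank per optimizer step
print(packed_length, tokens_per_step_per_rank)        # 262144 262144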
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_none_ckpt_False.py b/configs/7B_train/262144_none_ckpt_False.py deleted file mode 100644 index 85512f07..00000000 --- a/configs/7B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_none_ckpt_True.py b/configs/7B_train/262144_none_ckpt_True.py deleted file mode 100644 index fef559bd..00000000 --- a/configs/7B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
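The model dicts in these configs describe `checkpoint` as the proportion of layers for activation checkpointing, with allowed values True/False/[0-1]. The sketch below shows one natural reading of that proportion; the mapping to a concrete layer count (rounding p * NUM_LAYER) is my assumption and may differ from the actual implementation.

# Sketch: interpreting the activation-checkpointing proportion from the model dict comment.
NUM_LAYER = 32  # as in the surrounding 7B configs

def layers_to_checkpoint(checkpoint, num_layer=NUM_LAYER):
    # True/False behave like 1.0/0.0; a float in [0, 1] is read as a fraction of layers.
    proportion = float(checkpoint)
    return int(round(proportion * num_layer))

print(layers_to_checkpoint(True))   # 32 -> recompute activations for every layer
print(layers_to_checkpoint(False))  # 0  -> keep all activations
print(layers_to_checkpoint(0.5))    # 16 -> roughly half the layers (assumed rounding)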
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_flash-attn_ckpt_False.py b/configs/7B_train/32768_flash-attn_ckpt_False.py deleted file mode 100644 index f2664be8..00000000 --- a/configs/7B_train/32768_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_flash-attn_ckpt_True.py b/configs/7B_train/32768_flash-attn_ckpt_True.py deleted file mode 100644 index 232b5904..00000000 --- a/configs/7B_train/32768_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
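The parallelism docstring repeated in these configs gives three rules for zero1.size relative to the data-parallel group. The sketch below only restates those rules in code; dp_world_size is a hypothetical input, not a name taken from these configs.

# Sketch of the zero1.size rules quoted in the parallelism docstring.
def resolve_zero1_size(size, dp_world_size):
    if size <= 0:
        return dp_world_size  # zero process group spans the whole data-parallel group
    if size == 1:
        return 1              # ZeRO disabled; every dp rank keeps the full parameters
    assert size <= dp_world_size, "zero1 world size must be a subset of dp world size"
    return size               # parameters sharded across a subset of dp ranks

print(resolve_zero1_size(-1, 8))  # 8, matching zero1=dict(size=-1) in these configs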
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_flash_ckpt_False.py b/configs/7B_train/32768_flash_ckpt_False.py deleted file mode 100644 index 878b9ac1..00000000 --- a/configs/7B_train/32768_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
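For orientation across the many deleted variants, the tensor-parallel settings in this diff combine the sequence-parallel mode and the overlap flag as listed below; the values are read directly off the parallel dicts in this section, and the docstring gives the allowed sp modes as 'none', 'megatron', 'flash-attn', 'intern'.

# Tensor-parallel sp / intern_overlap combinations that appear in the deleted configs.
sp_variants = [
    dict(size=8, sp="none", intern_overlap=False),
    dict(size=8, sp="megatron", intern_overlap=False),
    dict(size=8, sp="flash-attn", intern_overlap=False),
    dict(size=8, sp="flash", intern_overlap=False),   # e.g. the 32768_flash_* configs
    dict(size=8, sp="intern", intern_overlap=True),   # overlap is only meaningful for 'intern' mode
]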
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_flash_ckpt_True.py b/configs/7B_train/32768_flash_ckpt_True.py deleted file mode 100644 index 27cffd02..00000000 --- a/configs/7B_train/32768_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
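The grad_scaler dict reads like a standard dynamic loss-scaling scheme. The class below is a sketch of the usual update rule for those knobs (growth_interval, growth_factor, backoff_factor, hysteresis, min/max scale); it is an assumption about the semantics, not the project's actual scaler:

```python
class LossScaler:
    """Minimal dynamic loss scaler driven by the grad_scaler config values (illustrative)."""

    def __init__(self, initial_scale=2**16, min_scale=1, max_scale=2**24,
                 growth_interval=1000, growth_factor=2, backoff_factor=0.5, hysteresis=2):
        self.scale = initial_scale
        self.min_scale, self.max_scale = min_scale, max_scale
        self.growth_interval, self.growth_factor = growth_interval, growth_factor
        self.backoff_factor, self.hysteresis = backoff_factor, hysteresis
        self._good_steps = 0
        self._overflows_left = hysteresis

    def update(self, found_overflow: bool) -> None:
        if found_overflow:
            self._good_steps = 0
            self._overflows_left -= 1
            if self._overflows_left <= 0:  # back off only after `hysteresis` overflows
                self.scale = max(self.scale * self.backoff_factor, self.min_scale)
                self._overflows_left = self.hysteresis
        else:
            self._good_steps += 1
            if self._good_steps >= self.growth_interval:  # grow after a clean run of steps
                self.scale = min(self.scale * self.growth_factor, self.max_scale)
                self._good_steps = 0
```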
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_intern_ckpt_False.py b/configs/7B_train/32768_intern_ckpt_False.py deleted file mode 100644 index fcf84197..00000000 --- a/configs/7B_train/32768_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
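The lr_scheduler dict (total_steps, init_steps, warmup_ratio, eta_min) looks like a warmup-then-decay schedule. The function below is a hedged sketch assuming linear warmup over warmup_ratio * total_steps followed by cosine decay to eta_min; the scheduler actually used by the repository may differ in detail:

```python
import math

def lr_at_step(step: int, base_lr: float = 1e-4, total_steps: int = 20,
               warmup_ratio: float = 0.01, eta_min: float = 1e-5) -> float:
    """Hypothetical warmup + cosine schedule built from the lr_scheduler fields."""
    warmup_steps = max(1, int(total_steps * warmup_ratio))
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps  # linear warmup
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * progress))
```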
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_intern_ckpt_True.py b/configs/7B_train/32768_intern_ckpt_True.py deleted file mode 100644 index aec2b68b..00000000 --- a/configs/7B_train/32768_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
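The ckpt comments describe a precedence: with auto_resume enabled (its default), training resumes from the latest checkpoint under save_ckpt_folder and load_ckpt_info is ignored; to initialize from another model you must set auto_resume=False, and training from scratch means auto_resume=False with load_ckpt_info=None. A small sketch of that decision; the latest_checkpoint hook and the returned fields are placeholders, not real APIs:

```python
from typing import Callable, Optional

def choose_checkpoint(auto_resume: bool,
                      save_ckpt_folder: str,
                      load_ckpt_info: Optional[dict],
                      latest_checkpoint: Callable[[str], Optional[str]] = lambda folder: None):
    """Illustrative resume precedence following the ckpt dict comments."""
    if auto_resume:
        latest = latest_checkpoint(save_ckpt_folder)  # placeholder hook: newest ckpt in folder
        if latest is not None:
            return dict(path=latest, content=("all",), ckpt_type="internlm")  # fields mirror load_ckpt_info
        return None  # nothing saved yet; whether it falls back to load_ckpt_info is an assumption
    if load_ckpt_info is not None:
        return load_ckpt_info  # explicit init from another model (requires auto_resume=False)
    return None  # train from scratch
```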
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_megatron_ckpt_False.py b/configs/7B_train/32768_megatron_ckpt_False.py deleted file mode 100644 index 64caeeb5..00000000 --- a/configs/7B_train/32768_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
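The model dict comment says checkpoint accepts True, False, or a proportion in [0, 1] of layers to activation-checkpoint. A hedged sketch of how such a value could map onto per-layer flags; which layers the project actually selects first is an assumption here:

```python
def layers_to_checkpoint(checkpoint, num_layers: int = 32):
    """Map True/False/[0-1] onto a per-layer activation-checkpointing flag (illustrative)."""
    if isinstance(checkpoint, bool):
        ratio = 1.0 if checkpoint else 0.0
    else:
        ratio = float(checkpoint)
    assert 0.0 <= ratio <= 1.0, "checkpoint proportion must lie in [0, 1]"
    n_ckpt = int(round(num_layers * ratio))
    return [i < n_ckpt for i in range(num_layers)]  # checkpoint the first n_ckpt layers

# checkpoint=True -> all 32 layers recompute activations; checkpoint=0.5 -> the first 16 layers
```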
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_megatron_ckpt_True.py b/configs/7B_train/32768_megatron_ckpt_True.py deleted file mode 100644 index a736e7d0..00000000 --- a/configs/7B_train/32768_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_none_ckpt_False.py b/configs/7B_train/32768_none_ckpt_False.py deleted file mode 100644 index 3a31776e..00000000 --- a/configs/7B_train/32768_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
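Across these variants, sp takes each value in ['none', 'megatron', 'flash-attn', 'intern'], and intern_overlap is enabled only together with sp="intern" (note that the earlier 32768 "flash" variants pass sp="flash", which is not among the documented modes). A small validation helper, hypothetical, that encodes the constraints from the docstring:

```python
VALID_SP_MODES = ("none", "megatron", "flash-attn", "intern")

def validate_tensor_parallel(cfg: dict) -> None:
    """Check a parallel['tensor'] dict against the constraints described in the docstring."""
    assert cfg.get("sp", "none") in VALID_SP_MODES, f"unknown sp mode: {cfg.get('sp')!r}"
    if cfg.get("intern_overlap", False):
        # all_gather/reduce_scatter overlap is only defined for the 'intern' mode
        assert cfg["sp"] == "intern", "intern_overlap=True requires sp='intern'"

validate_tensor_parallel(dict(size=8, sp="intern", intern_overlap=True))
```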
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_none_ckpt_True.py b/configs/7B_train/32768_none_ckpt_True.py deleted file mode 100644 index 4ac09249..00000000 --- a/configs/7B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash-attn_ckpt_False.py b/configs/7B_train/4096_flash-attn_ckpt_False.py deleted file mode 100644 index b3de8990..00000000 --- a/configs/7B_train/4096_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash-attn_ckpt_True.py b/configs/7B_train/4096_flash-attn_ckpt_True.py deleted file mode 100644 index b44b103f..00000000 --- a/configs/7B_train/4096_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
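The deleted files follow a naming pattern of {SEQ_LEN}_{sp}_ckpt_{checkpoint}.py over a grid of sequence lengths, sequence-parallel modes, and activation-checkpoint settings, which suggests they were produced from a template. The sketch below shows how such a grid could be emitted; the template path, placeholder names, and output directory are assumptions, not the repository's actual generator:

```python
from itertools import product
from pathlib import Path

SEQ_LENS = (4096, 32768)  # lengths seen in the deleted 7B file names
SP_MODES = ("none", "megatron", "flash-attn", "intern")
CKPTS = (False, True)

def emit_configs(template_path: str = "configs/7B_template.py",
                 out_dir: str = "configs/7B_train") -> None:
    """Hypothetical generator: fill template placeholders for each (seq_len, sp, checkpoint) combo."""
    template = Path(template_path).read_text()
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    for seq_len, sp, ckpt in product(SEQ_LENS, SP_MODES, CKPTS):
        # repr(sp) quotes the mode so it lands in the generated file as a Python string literal
        body = template.format(seq_len=seq_len, sp=repr(sp), checkpoint=ckpt)
        Path(out_dir, f"{seq_len}_{sp}_ckpt_{ckpt}.py").write_text(body)
```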
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash_ckpt_False.py b/configs/7B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 8ac542d6..00000000 --- a/configs/7B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash_ckpt_True.py b/configs/7B_train/4096_flash_ckpt_True.py deleted file mode 100644 index ec477f68..00000000 --- a/configs/7B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_intern_ckpt_False.py b/configs/7B_train/4096_intern_ckpt_False.py deleted file mode 100644 index f16f95ad..00000000 --- a/configs/7B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_intern_ckpt_True.py b/configs/7B_train/4096_intern_ckpt_True.py deleted file mode 100644 index 90fed7c8..00000000 --- a/configs/7B_train/4096_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_megatron_ckpt_False.py b/configs/7B_train/4096_megatron_ckpt_False.py deleted file mode 100644 index ca41fa28..00000000 --- a/configs/7B_train/4096_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_megatron_ckpt_True.py b/configs/7B_train/4096_megatron_ckpt_True.py deleted file mode 100644 index 45183156..00000000 --- a/configs/7B_train/4096_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_none_ckpt_False.py b/configs/7B_train/4096_none_ckpt_False.py deleted file mode 100644 index c81bb5b9..00000000 --- a/configs/7B_train/4096_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_none_ckpt_True.py b/configs/7B_train/4096_none_ckpt_True.py deleted file mode 100644 index a25d222f..00000000 --- a/configs/7B_train/4096_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_flash-attn_ckpt_False.py b/configs/7B_train/65536_flash-attn_ckpt_False.py deleted file mode 100644 index 3d5a81eb..00000000 --- a/configs/7B_train/65536_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_flash-attn_ckpt_True.py b/configs/7B_train/65536_flash-attn_ckpt_True.py deleted file mode 100644 index c6982c98..00000000 --- a/configs/7B_train/65536_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_flash_ckpt_False.py b/configs/7B_train/65536_flash_ckpt_False.py deleted file mode 100644 index 0cfea813..00000000 --- a/configs/7B_train/65536_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_flash_ckpt_True.py b/configs/7B_train/65536_flash_ckpt_True.py deleted file mode 100644 index abdeb49d..00000000 --- a/configs/7B_train/65536_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_intern_ckpt_False.py b/configs/7B_train/65536_intern_ckpt_False.py deleted file mode 100644 index 2e0b27e1..00000000 --- a/configs/7B_train/65536_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_intern_ckpt_True.py b/configs/7B_train/65536_intern_ckpt_True.py deleted file mode 100644 index d1a8de7c..00000000 --- a/configs/7B_train/65536_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_megatron_ckpt_False.py b/configs/7B_train/65536_megatron_ckpt_False.py deleted file mode 100644 index 7de7b92d..00000000 --- a/configs/7B_train/65536_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_megatron_ckpt_True.py b/configs/7B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index b339c833..00000000 --- a/configs/7B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_none_ckpt_False.py b/configs/7B_train/65536_none_ckpt_False.py deleted file mode 100644 index b8c44769..00000000 --- a/configs/7B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_none_ckpt_True.py b/configs/7B_train/65536_none_ckpt_True.py deleted file mode 100644 index b907e437..00000000 --- a/configs/7B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash-attn_ckpt_False.py b/configs/7B_train/8192_flash-attn_ckpt_False.py deleted file mode 100644 index d0ddd438..00000000 --- a/configs/7B_train/8192_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash-attn_ckpt_True.py b/configs/7B_train/8192_flash-attn_ckpt_True.py deleted file mode 100644 index d9e5b2f9..00000000 --- a/configs/7B_train/8192_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash_ckpt_False.py b/configs/7B_train/8192_flash_ckpt_False.py deleted file mode 100644 index 69546d11..00000000 --- a/configs/7B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash_ckpt_True.py b/configs/7B_train/8192_flash_ckpt_True.py deleted file mode 100644 index 4c7f9864..00000000 --- a/configs/7B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_intern_ckpt_False.py b/configs/7B_train/8192_intern_ckpt_False.py deleted file mode 100644 index 9694ad81..00000000 --- a/configs/7B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_intern_ckpt_True.py b/configs/7B_train/8192_intern_ckpt_True.py deleted file mode 100644 index 99a0fc18..00000000 --- a/configs/7B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_megatron_ckpt_False.py b/configs/7B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index f18ee730..00000000 --- a/configs/7B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_megatron_ckpt_True.py b/configs/7B_train/8192_megatron_ckpt_True.py deleted file mode 100644 index 1db58412..00000000 --- a/configs/7B_train/8192_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_none_ckpt_False.py b/configs/7B_train/8192_none_ckpt_False.py deleted file mode 100644 index 95d686bb..00000000 --- a/configs/7B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_none_ckpt_True.py b/configs/7B_train/8192_none_ckpt_True.py deleted file mode 100644 index a63b6f20..00000000 --- a/configs/7B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. 
- load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - 
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-)
-"""
-zero1 parallel (dict):
-    1. size: int
-        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
-            so parameters will be divided within the range of dp.
-        * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-        * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
-tensor parallel (dict):
-    1. size: int, the size of tensor parallel.
-    2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'],
-        defaults to 'none', means the sequence parallel will be disabled.
-    3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp,
-        defaults to False.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler,
-        defaults to False.
-"""
-parallel = dict(
-    zero1=dict(size=-1, fsdp=False),
-    tensor=dict(size=8, sp="none", intern_overlap=False),
-    pipeline=dict(size=1, interleaved_overlap=True),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
-
-monitor = dict(
-    # feishu alert configs
-    alert=dict(
-        enable_feishu_alert=DO_ALERT,
-        feishu_alert_address=None,  # feishu webhook to send alert message
-        light_monitor_address=None,  # light_monitor address to send heartbeat
-    ),
-)
diff --git a/configs/generate.py b/configs/generate.py
index 6a58f098..2b7d9ce5 100644
--- a/configs/generate.py
+++ b/configs/generate.py
@@ -1,17 +1,16 @@
-import os
 import copy
+import os
 import subprocess
 
 name = "./configs/"
 root_names = ["7B_train_", "13B_train_", "30B_train_"]
 model_size = ["7B", "13B", "30B"]
-seq_length = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
+micro_bsz = [1, 2, 4, 8, 16, 32, 64]
 sp = ["none", "megatron", "flash-attn", "intern"]
 intern_overlap = [False, False, False, True]
 checkpoint = [False, True]
 
 for idx, root_name in enumerate(root_names):
-    # 指定要创建的文件夹路径
     folder_path = name + root_name[:-1]
 
 
@@ -24,21 +23,21 @@
     with open(file_name, "r") as f:
         lines = f.readlines()
     origin_line = "".join(lines)
-    for seq in seq_length:
+    for mb in micro_bsz:
         for i, sp_mode in enumerate(sp):
             for ckpt in checkpoint:
                 line = copy.copy(origin_line)
-                line = line.replace("{seq_len}", str(seq))
-                line = line.replace("{sp}", f"\"{sp_mode}\"")
+                line = line.replace("{micro_bsz}", str(mb))
+                line = line.replace("{sp}", f'"{sp_mode}"')
                 line = line.replace("{intern_overlap}", str(intern_overlap[i]))
                 line = line.replace("{checkpoint}", str(ckpt))
-                output_file_name = str(seq) + "_" + str(sp_mode) + "_ckpt_" + str(ckpt) + ".py"
+                output_file_name = str(mb) + "_" + str(sp_mode) + "_ckpt_" + str(ckpt) + ".py"
                 write_file = folder_path + "/" + output_file_name
                 with open(write_file, "w") as file:
                     file.write(line)
-
+
                 log_name = root_name + "_" + output_file_name[:-3]
-
+
                 command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=10 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
-                process = subprocess.Popen(command, shell=True, executable='/bin/bash')
-                process.wait()
\ No newline at end of file
+                process = subprocess.Popen(command, shell=True, executable="/bin/bash")
+                process.wait()
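For reference, the rewritten sweep in configs/generate.py renders one config per (micro_bsz, sp mode, activation-checkpoint) combination from the template placeholders and immediately submits an srun job for each. Below is a minimal dry-run sketch of the same rendering step; it is not part of this patch, it assumes the template is read from name + root_name + "template.py" (that assignment falls outside the hunks shown), and it only writes the rendered files so they can be inspected before any jobs are launched.

# Hypothetical dry-run helper (not part of this patch): render the swept configs
# without launching srun jobs, so the generated files can be inspected first.
import copy
import os

name = "./configs/"
root_names = ["7B_train_", "13B_train_", "30B_train_"]
micro_bsz = [1, 2, 4, 8, 16, 32, 64]
sp = ["none", "megatron", "flash-attn", "intern"]
intern_overlap = [False, False, False, True]  # aligned with sp by index
checkpoint = [False, True]

for root_name in root_names:
    folder_path = name + root_name[:-1]  # e.g. ./configs/7B_train
    os.makedirs(folder_path, exist_ok=True)
    # Assumed template location; the real assignment is outside the hunks shown.
    with open(name + root_name + "template.py", "r") as f:
        origin_line = f.read()
    for mb in micro_bsz:
        for i, sp_mode in enumerate(sp):
            for ckpt in checkpoint:
                line = copy.copy(origin_line)
                line = line.replace("{micro_bsz}", str(mb))
                line = line.replace("{sp}", f'"{sp_mode}"')
                line = line.replace("{intern_overlap}", str(intern_overlap[i]))
                line = line.replace("{checkpoint}", str(ckpt))
                write_file = f"{folder_path}/{mb}_{sp_mode}_ckpt_{ckpt}.py"
                with open(write_file, "w") as file:
                    file.write(line)
                print("generated", write_file)  # review these before submitting srun jobs

The generated names follow the same _ckpt_True/_ckpt_False scheme as the deleted files above, except that the leading field is now micro_bsz rather than sequence length, since SEQ_LEN is fixed at 4096 in the template.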