diff --git a/.gitignore b/.gitignore index 04367e3d..9bdc7ec7 100644 --- a/.gitignore +++ b/.gitignore @@ -149,5 +149,9 @@ memory_trace 13b_train*/ 30b_train*/ fstp_logs/ +configs/7B_train/* +configs/13B_train/* +configs/30B_train/* + atb pip diff --git a/configs/13B_template.py b/configs/13B_template.py index 26be3f71..e0e016cc 100644 --- a/configs/13B_template.py +++ b/configs/13B_template.py @@ -2,7 +2,7 @@ DO_ALERT = False SEQ_LEN = {seq_len} -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) HIDDEN_SIZE = 5120 NUM_ATTENTION_HEAD = 40 MLP_RATIO = 8 / 3 @@ -50,9 +50,9 @@ data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, + micro_num=1, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, + micro_bsz=1, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -91,7 +91,7 @@ hybrid_zero_optimizer = dict( # Enable low_level_optimzer overlap_communication overlap_sync_grad=True, - overlap_sync_param=True, + overlap_sync_param=False, # bucket size for nccl communication params reduce_bucket_size=512 * 1024 * 1024, # grad clipping diff --git a/configs/13B_train/131072_flash-attn_ckpt_False.py b/configs/13B_train/131072_flash-attn_ckpt_False.py deleted file mode 100644 index 28d51af6..00000000 --- a/configs/13B_train/131072_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash-attn_ckpt_True.py b/configs/13B_train/131072_flash-attn_ckpt_True.py deleted file mode 100644 index 6d1b7ef0..00000000 --- a/configs/13B_train/131072_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash_ckpt_False.py b/configs/13B_train/131072_flash_ckpt_False.py deleted file mode 100644 index dd0f0e89..00000000 --- a/configs/13B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_flash_ckpt_True.py b/configs/13B_train/131072_flash_ckpt_True.py deleted file mode 100644 index 2b9276db..00000000 --- a/configs/13B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_intern_ckpt_False.py b/configs/13B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 182e4ddb..00000000 --- a/configs/13B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_intern_ckpt_True.py b/configs/13B_train/131072_intern_ckpt_True.py deleted file mode 100644 index c23a3c10..00000000 --- a/configs/13B_train/131072_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_megatron_ckpt_False.py b/configs/13B_train/131072_megatron_ckpt_False.py deleted file mode 100644 index 935ff98d..00000000 --- a/configs/13B_train/131072_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_megatron_ckpt_True.py b/configs/13B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index 441166c2..00000000 --- a/configs/13B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_none_ckpt_False.py b/configs/13B_train/131072_none_ckpt_False.py deleted file mode 100644 index e43d6044..00000000 --- a/configs/13B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/131072_none_ckpt_True.py b/configs/13B_train/131072_none_ckpt_True.py deleted file mode 100644 index 0945dbdc..00000000 --- a/configs/13B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash-attn_ckpt_False.py b/configs/13B_train/16384_flash-attn_ckpt_False.py deleted file mode 100644 index 393e54d3..00000000 --- a/configs/13B_train/16384_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash-attn_ckpt_True.py b/configs/13B_train/16384_flash-attn_ckpt_True.py deleted file mode 100644 index 7f7e7ac6..00000000 --- a/configs/13B_train/16384_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash_ckpt_False.py b/configs/13B_train/16384_flash_ckpt_False.py deleted file mode 100644 index cadd215f..00000000 --- a/configs/13B_train/16384_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_flash_ckpt_True.py b/configs/13B_train/16384_flash_ckpt_True.py deleted file mode 100644 index c60ea730..00000000 --- a/configs/13B_train/16384_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_intern_ckpt_False.py b/configs/13B_train/16384_intern_ckpt_False.py deleted file mode 100644 index e5d6fa6b..00000000 --- a/configs/13B_train/16384_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_intern_ckpt_True.py b/configs/13B_train/16384_intern_ckpt_True.py deleted file mode 100644 index 6ac47ac2..00000000 --- a/configs/13B_train/16384_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_megatron_ckpt_False.py b/configs/13B_train/16384_megatron_ckpt_False.py deleted file mode 100644 index 24429ead..00000000 --- a/configs/13B_train/16384_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_megatron_ckpt_True.py b/configs/13B_train/16384_megatron_ckpt_True.py deleted file mode 100644 index d79c8207..00000000 --- a/configs/13B_train/16384_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_none_ckpt_False.py b/configs/13B_train/16384_none_ckpt_False.py deleted file mode 100644 index a30d713a..00000000 --- a/configs/13B_train/16384_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/16384_none_ckpt_True.py b/configs/13B_train/16384_none_ckpt_True.py deleted file mode 100644 index 76483257..00000000 --- a/configs/13B_train/16384_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_flash-attn_ckpt_False.py b/configs/13B_train/262144_flash-attn_ckpt_False.py deleted file mode 100644 index fd0be6a7..00000000 --- a/configs/13B_train/262144_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_flash_ckpt_False.py b/configs/13B_train/262144_flash_ckpt_False.py deleted file mode 100644 index 5ca332ef..00000000 --- a/configs/13B_train/262144_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_flash_ckpt_True.py b/configs/13B_train/262144_flash_ckpt_True.py deleted file mode 100644 index f990655a..00000000 --- a/configs/13B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_intern_ckpt_False.py b/configs/13B_train/262144_intern_ckpt_False.py deleted file mode 100644 index 7ebcf94f..00000000 --- a/configs/13B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_intern_ckpt_True.py b/configs/13B_train/262144_intern_ckpt_True.py deleted file mode 100644 index e958ac06..00000000 --- a/configs/13B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_megatron_ckpt_False.py b/configs/13B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index 31e96f78..00000000 --- a/configs/13B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_megatron_ckpt_True.py b/configs/13B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 2339244b..00000000 --- a/configs/13B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_none_ckpt_False.py b/configs/13B_train/262144_none_ckpt_False.py deleted file mode 100644 index 41d55e91..00000000 --- a/configs/13B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/262144_none_ckpt_True.py b/configs/13B_train/262144_none_ckpt_True.py deleted file mode 100644 index 4f2da605..00000000 --- a/configs/13B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash-attn_ckpt_False.py b/configs/13B_train/32768_flash-attn_ckpt_False.py deleted file mode 100644 index 3eb0f493..00000000 --- a/configs/13B_train/32768_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash-attn_ckpt_True.py b/configs/13B_train/32768_flash-attn_ckpt_True.py deleted file mode 100644 index 26b06ef3..00000000 --- a/configs/13B_train/32768_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash_ckpt_False.py b/configs/13B_train/32768_flash_ckpt_False.py deleted file mode 100644 index da30a4dd..00000000 --- a/configs/13B_train/32768_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_flash_ckpt_True.py b/configs/13B_train/32768_flash_ckpt_True.py deleted file mode 100644 index 20d415a5..00000000 --- a/configs/13B_train/32768_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_intern_ckpt_False.py b/configs/13B_train/32768_intern_ckpt_False.py deleted file mode 100644 index 05ab5285..00000000 --- a/configs/13B_train/32768_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_intern_ckpt_True.py b/configs/13B_train/32768_intern_ckpt_True.py deleted file mode 100644 index 273a812d..00000000 --- a/configs/13B_train/32768_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_megatron_ckpt_False.py b/configs/13B_train/32768_megatron_ckpt_False.py deleted file mode 100644 index c8db542d..00000000 --- a/configs/13B_train/32768_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_megatron_ckpt_True.py b/configs/13B_train/32768_megatron_ckpt_True.py deleted file mode 100644 index 9ff56012..00000000 --- a/configs/13B_train/32768_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_none_ckpt_False.py b/configs/13B_train/32768_none_ckpt_False.py deleted file mode 100644 index a02e0711..00000000 --- a/configs/13B_train/32768_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/32768_none_ckpt_True.py b/configs/13B_train/32768_none_ckpt_True.py deleted file mode 100644 index b9b17e3c..00000000 --- a/configs/13B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash-attn_ckpt_False.py b/configs/13B_train/4096_flash-attn_ckpt_False.py deleted file mode 100644 index 8e4459ea..00000000 --- a/configs/13B_train/4096_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash-attn_ckpt_True.py b/configs/13B_train/4096_flash-attn_ckpt_True.py deleted file mode 100644 index a8f5e39b..00000000 --- a/configs/13B_train/4096_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash_ckpt_False.py b/configs/13B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 517b46e4..00000000 --- a/configs/13B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_flash_ckpt_True.py b/configs/13B_train/4096_flash_ckpt_True.py deleted file mode 100644 index eacfcdfd..00000000 --- a/configs/13B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_intern_ckpt_False.py b/configs/13B_train/4096_intern_ckpt_False.py deleted file mode 100644 index 5ecf2d66..00000000 --- a/configs/13B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_intern_ckpt_True.py b/configs/13B_train/4096_intern_ckpt_True.py deleted file mode 100644 index b70acb01..00000000 --- a/configs/13B_train/4096_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_megatron_ckpt_False.py b/configs/13B_train/4096_megatron_ckpt_False.py deleted file mode 100644 index 2e847a64..00000000 --- a/configs/13B_train/4096_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_megatron_ckpt_True.py b/configs/13B_train/4096_megatron_ckpt_True.py deleted file mode 100644 index d8ba2c57..00000000 --- a/configs/13B_train/4096_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_none_ckpt_False.py b/configs/13B_train/4096_none_ckpt_False.py deleted file mode 100644 index f8bbdfc5..00000000 --- a/configs/13B_train/4096_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/4096_none_ckpt_True.py b/configs/13B_train/4096_none_ckpt_True.py deleted file mode 100644 index d8f8ec7e..00000000 --- a/configs/13B_train/4096_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash-attn_ckpt_False.py b/configs/13B_train/65536_flash-attn_ckpt_False.py deleted file mode 100644 index 09367f5a..00000000 --- a/configs/13B_train/65536_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash-attn_ckpt_True.py b/configs/13B_train/65536_flash-attn_ckpt_True.py deleted file mode 100644 index dc283a92..00000000 --- a/configs/13B_train/65536_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash_ckpt_False.py b/configs/13B_train/65536_flash_ckpt_False.py deleted file mode 100644 index 482d5114..00000000 --- a/configs/13B_train/65536_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_flash_ckpt_True.py b/configs/13B_train/65536_flash_ckpt_True.py deleted file mode 100644 index 66051f83..00000000 --- a/configs/13B_train/65536_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_intern_ckpt_False.py b/configs/13B_train/65536_intern_ckpt_False.py deleted file mode 100644 index f829652a..00000000 --- a/configs/13B_train/65536_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_intern_ckpt_True.py b/configs/13B_train/65536_intern_ckpt_True.py deleted file mode 100644 index 4e94d0e3..00000000 --- a/configs/13B_train/65536_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_megatron_ckpt_False.py b/configs/13B_train/65536_megatron_ckpt_False.py deleted file mode 100644 index a9293334..00000000 --- a/configs/13B_train/65536_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_megatron_ckpt_True.py b/configs/13B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index 845e32bc..00000000 --- a/configs/13B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_none_ckpt_False.py b/configs/13B_train/65536_none_ckpt_False.py deleted file mode 100644 index 52ce3c52..00000000 --- a/configs/13B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/65536_none_ckpt_True.py b/configs/13B_train/65536_none_ckpt_True.py deleted file mode 100644 index de5532e1..00000000 --- a/configs/13B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash-attn_ckpt_False.py b/configs/13B_train/8192_flash-attn_ckpt_False.py deleted file mode 100644 index 3324c290..00000000 --- a/configs/13B_train/8192_flash-attn_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash-attn_ckpt_True.py b/configs/13B_train/8192_flash-attn_ckpt_True.py deleted file mode 100644 index 317e0f32..00000000 --- a/configs/13B_train/8192_flash-attn_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash_ckpt_False.py b/configs/13B_train/8192_flash_ckpt_False.py deleted file mode 100644 index d645dc1b..00000000 --- a/configs/13B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_flash_ckpt_True.py b/configs/13B_train/8192_flash_ckpt_True.py deleted file mode 100644 index 425859c0..00000000 --- a/configs/13B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_intern_ckpt_False.py b/configs/13B_train/8192_intern_ckpt_False.py deleted file mode 100644 index 0b4fb8a2..00000000 --- a/configs/13B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_intern_ckpt_True.py b/configs/13B_train/8192_intern_ckpt_True.py deleted file mode 100644 index b42cb769..00000000 --- a/configs/13B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_megatron_ckpt_False.py b/configs/13B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index e2191937..00000000 --- a/configs/13B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_megatron_ckpt_True.py b/configs/13B_train/8192_megatron_ckpt_True.py deleted file mode 100644 index 5123c412..00000000 --- a/configs/13B_train/8192_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_none_ckpt_False.py b/configs/13B_train/8192_none_ckpt_False.py deleted file mode 100644 index c9d9c050..00000000 --- a/configs/13B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/13B_train/8192_none_ckpt_True.py b/configs/13B_train/8192_none_ckpt_True.py deleted file mode 100644 index 182ec21f..00000000 --- a/configs/13B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 5120 -NUM_ATTENTION_HEAD = 40 -MLP_RATIO = 8 / 3 -NUM_LAYER = 40 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_template.py b/configs/30B_template.py index 7a32015e..4ac99bf0 100644 --- a/configs/30B_template.py +++ b/configs/30B_template.py @@ -2,7 +2,7 @@ DO_ALERT = False SEQ_LEN = {seq_len} -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) HIDDEN_SIZE = 6144 NUM_ATTENTION_HEAD = 48 MLP_RATIO = 8 / 3 @@ -50,9 +50,9 @@ data = dict( seq_len=SEQ_LEN, # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, + micro_num=1, # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, + micro_bsz=1, # defaults to the value of micro_num valid_micro_num=4, # defaults to 0, means disable evaluate @@ -91,7 +91,7 @@ hybrid_zero_optimizer = dict( # Enable low_level_optimzer overlap_communication overlap_sync_grad=True, - overlap_sync_param=True, + overlap_sync_param=False, # bucket size for nccl communication params reduce_bucket_size=512 * 1024 * 1024, # grad clipping diff --git a/configs/30B_train/131072_flash_ckpt_False.py b/configs/30B_train/131072_flash_ckpt_False.py deleted file mode 100644 index 3af48f3e..00000000 --- a/configs/30B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_flash_ckpt_True.py b/configs/30B_train/131072_flash_ckpt_True.py deleted file mode 100644 index 4bd249bc..00000000 --- a/configs/30B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_intern_ckpt_False.py b/configs/30B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 77b176d2..00000000 --- a/configs/30B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_intern_ckpt_True.py b/configs/30B_train/131072_intern_ckpt_True.py deleted file mode 100644 index 38a1db3b..00000000 --- a/configs/30B_train/131072_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_megatron_ckpt_False.py b/configs/30B_train/131072_megatron_ckpt_False.py deleted file mode 100644 index 49879303..00000000 --- a/configs/30B_train/131072_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_megatron_ckpt_True.py b/configs/30B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index d911d381..00000000 --- a/configs/30B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_none_ckpt_False.py b/configs/30B_train/131072_none_ckpt_False.py deleted file mode 100644 index 78b3c9a8..00000000 --- a/configs/30B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/131072_none_ckpt_True.py b/configs/30B_train/131072_none_ckpt_True.py deleted file mode 100644 index 941279e7..00000000 --- a/configs/30B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_flash_ckpt_False.py b/configs/30B_train/16384_flash_ckpt_False.py deleted file mode 100644 index 779a10bc..00000000 --- a/configs/30B_train/16384_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_flash_ckpt_True.py b/configs/30B_train/16384_flash_ckpt_True.py deleted file mode 100644 index 0498e2c4..00000000 --- a/configs/30B_train/16384_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_intern_ckpt_False.py b/configs/30B_train/16384_intern_ckpt_False.py deleted file mode 100644 index 309a33f0..00000000 --- a/configs/30B_train/16384_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_intern_ckpt_True.py b/configs/30B_train/16384_intern_ckpt_True.py deleted file mode 100644 index 23c977a5..00000000 --- a/configs/30B_train/16384_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_megatron_ckpt_False.py b/configs/30B_train/16384_megatron_ckpt_False.py deleted file mode 100644 index 8576aa76..00000000 --- a/configs/30B_train/16384_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_megatron_ckpt_True.py b/configs/30B_train/16384_megatron_ckpt_True.py deleted file mode 100644 index 460aba3b..00000000 --- a/configs/30B_train/16384_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_none_ckpt_False.py b/configs/30B_train/16384_none_ckpt_False.py deleted file mode 100644 index 4ca50666..00000000 --- a/configs/30B_train/16384_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/16384_none_ckpt_True.py b/configs/30B_train/16384_none_ckpt_True.py deleted file mode 100644 index c7987e0d..00000000 --- a/configs/30B_train/16384_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_flash_ckpt_False.py b/configs/30B_train/262144_flash_ckpt_False.py deleted file mode 100644 index 10d71d9c..00000000 --- a/configs/30B_train/262144_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_flash_ckpt_True.py b/configs/30B_train/262144_flash_ckpt_True.py deleted file mode 100644 index a1990dbb..00000000 --- a/configs/30B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_intern_ckpt_False.py b/configs/30B_train/262144_intern_ckpt_False.py deleted file mode 100644 index f8ec6a2f..00000000 --- a/configs/30B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_intern_ckpt_True.py b/configs/30B_train/262144_intern_ckpt_True.py deleted file mode 100644 index c5afa46b..00000000 --- a/configs/30B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_megatron_ckpt_False.py b/configs/30B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index 412da179..00000000 --- a/configs/30B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_megatron_ckpt_True.py b/configs/30B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 79affb19..00000000 --- a/configs/30B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_none_ckpt_False.py b/configs/30B_train/262144_none_ckpt_False.py deleted file mode 100644 index e6fbe1eb..00000000 --- a/configs/30B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/262144_none_ckpt_True.py b/configs/30B_train/262144_none_ckpt_True.py deleted file mode 100644 index d507c30b..00000000 --- a/configs/30B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_flash_ckpt_False.py b/configs/30B_train/32768_flash_ckpt_False.py deleted file mode 100644 index 6bac5b31..00000000 --- a/configs/30B_train/32768_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_flash_ckpt_True.py b/configs/30B_train/32768_flash_ckpt_True.py deleted file mode 100644 index f21c9983..00000000 --- a/configs/30B_train/32768_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_intern_ckpt_False.py b/configs/30B_train/32768_intern_ckpt_False.py deleted file mode 100644 index 79728d64..00000000 --- a/configs/30B_train/32768_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_intern_ckpt_True.py b/configs/30B_train/32768_intern_ckpt_True.py deleted file mode 100644 index 6dc24c30..00000000 --- a/configs/30B_train/32768_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_megatron_ckpt_False.py b/configs/30B_train/32768_megatron_ckpt_False.py deleted file mode 100644 index 37fd0986..00000000 --- a/configs/30B_train/32768_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_megatron_ckpt_True.py b/configs/30B_train/32768_megatron_ckpt_True.py deleted file mode 100644 index 986b27dd..00000000 --- a/configs/30B_train/32768_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_none_ckpt_False.py b/configs/30B_train/32768_none_ckpt_False.py deleted file mode 100644 index 9c6ca879..00000000 --- a/configs/30B_train/32768_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/32768_none_ckpt_True.py b/configs/30B_train/32768_none_ckpt_True.py deleted file mode 100644 index d4ab7f2d..00000000 --- a/configs/30B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_flash_ckpt_False.py b/configs/30B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 3dd8be56..00000000 --- a/configs/30B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_flash_ckpt_True.py b/configs/30B_train/4096_flash_ckpt_True.py deleted file mode 100644 index 73150acf..00000000 --- a/configs/30B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_intern_ckpt_False.py b/configs/30B_train/4096_intern_ckpt_False.py deleted file mode 100644 index cff6c5b6..00000000 --- a/configs/30B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_intern_ckpt_True.py b/configs/30B_train/4096_intern_ckpt_True.py deleted file mode 100644 index 1fb64257..00000000 --- a/configs/30B_train/4096_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_megatron_ckpt_False.py b/configs/30B_train/4096_megatron_ckpt_False.py deleted file mode 100644 index 79f718d0..00000000 --- a/configs/30B_train/4096_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_megatron_ckpt_True.py b/configs/30B_train/4096_megatron_ckpt_True.py deleted file mode 100644 index 502ae7f7..00000000 --- a/configs/30B_train/4096_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_none_ckpt_False.py b/configs/30B_train/4096_none_ckpt_False.py deleted file mode 100644 index 981a0f23..00000000 --- a/configs/30B_train/4096_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/4096_none_ckpt_True.py b/configs/30B_train/4096_none_ckpt_True.py deleted file mode 100644 index dddea663..00000000 --- a/configs/30B_train/4096_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_flash_ckpt_False.py b/configs/30B_train/65536_flash_ckpt_False.py deleted file mode 100644 index babebd95..00000000 --- a/configs/30B_train/65536_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_flash_ckpt_True.py b/configs/30B_train/65536_flash_ckpt_True.py deleted file mode 100644 index 064250e7..00000000 --- a/configs/30B_train/65536_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_intern_ckpt_False.py b/configs/30B_train/65536_intern_ckpt_False.py deleted file mode 100644 index 64165f44..00000000 --- a/configs/30B_train/65536_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_intern_ckpt_True.py b/configs/30B_train/65536_intern_ckpt_True.py deleted file mode 100644 index 78b66213..00000000 --- a/configs/30B_train/65536_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_megatron_ckpt_False.py b/configs/30B_train/65536_megatron_ckpt_False.py deleted file mode 100644 index e8c09548..00000000 --- a/configs/30B_train/65536_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_megatron_ckpt_True.py b/configs/30B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index d3b64c41..00000000 --- a/configs/30B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_none_ckpt_False.py b/configs/30B_train/65536_none_ckpt_False.py deleted file mode 100644 index ee4c7fb5..00000000 --- a/configs/30B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/65536_none_ckpt_True.py b/configs/30B_train/65536_none_ckpt_True.py deleted file mode 100644 index 2e84144c..00000000 --- a/configs/30B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_flash_ckpt_False.py b/configs/30B_train/8192_flash_ckpt_False.py deleted file mode 100644 index b9eb6e65..00000000 --- a/configs/30B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_flash_ckpt_True.py b/configs/30B_train/8192_flash_ckpt_True.py deleted file mode 100644 index c0dd5175..00000000 --- a/configs/30B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_intern_ckpt_False.py b/configs/30B_train/8192_intern_ckpt_False.py deleted file mode 100644 index d915b6b8..00000000 --- a/configs/30B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_intern_ckpt_True.py b/configs/30B_train/8192_intern_ckpt_True.py deleted file mode 100644 index a71693a1..00000000 --- a/configs/30B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_megatron_ckpt_False.py b/configs/30B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index dcacb9e5..00000000 --- a/configs/30B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_megatron_ckpt_True.py b/configs/30B_train/8192_megatron_ckpt_True.py deleted file mode 100644 index b6e4ba24..00000000 --- a/configs/30B_train/8192_megatron_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_none_ckpt_False.py b/configs/30B_train/8192_none_ckpt_False.py deleted file mode 100644 index ce790dfa..00000000 --- a/configs/30B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/30B_train/8192_none_ckpt_True.py b/configs/30B_train/8192_none_ckpt_True.py deleted file mode 100644 index e6afcd4e..00000000 --- a/configs/30B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,180 +0,0 @@ - -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) -HIDDEN_SIZE = 6144 -NUM_ATTENTION_HEAD = 48 -MLP_RATIO = 8 / 3 -NUM_LAYER = 60 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=4, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=2, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=False, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=10, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=True, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. mode: str, the mode should be 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp', - the sequence_parallel should be True. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -sequence parallel (bool): enable/disable sequence parallel, defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_template.py b/configs/7B_template.py index b9f76a51..d78fc884 100644 --- a/configs/7B_template.py +++ b/configs/7B_template.py @@ -2,7 +2,7 @@ DO_ALERT = False SEQ_LEN = {seq_len} -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({checkpoint}) +JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint}) HIDDEN_SIZE = 4096 NUM_ATTENTION_HEAD = 32 MLP_RATIO = 8 / 3 diff --git a/configs/7B_train/131072_flash-attn_ckpt_False.py b/configs/7B_train/131072_flash-attn_ckpt_False.py deleted file mode 100644 index 047fb372..00000000 --- a/configs/7B_train/131072_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_flash-attn_ckpt_True.py b/configs/7B_train/131072_flash-attn_ckpt_True.py deleted file mode 100644 index 763627d6..00000000 --- a/configs/7B_train/131072_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_flash_ckpt_False.py b/configs/7B_train/131072_flash_ckpt_False.py deleted file mode 100644 index 4307e9d1..00000000 --- a/configs/7B_train/131072_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_flash_ckpt_True.py b/configs/7B_train/131072_flash_ckpt_True.py deleted file mode 100644 index c110b256..00000000 --- a/configs/7B_train/131072_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_intern_ckpt_False.py b/configs/7B_train/131072_intern_ckpt_False.py deleted file mode 100644 index 1d728be7..00000000 --- a/configs/7B_train/131072_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_intern_ckpt_True.py b/configs/7B_train/131072_intern_ckpt_True.py deleted file mode 100644 index 45d4aa01..00000000 --- a/configs/7B_train/131072_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_megatron_ckpt_False.py b/configs/7B_train/131072_megatron_ckpt_False.py deleted file mode 100644 index 0bd98459..00000000 --- a/configs/7B_train/131072_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_megatron_ckpt_True.py b/configs/7B_train/131072_megatron_ckpt_True.py deleted file mode 100644 index 9200afbe..00000000 --- a/configs/7B_train/131072_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_none_ckpt_False.py b/configs/7B_train/131072_none_ckpt_False.py deleted file mode 100644 index 16059fb1..00000000 --- a/configs/7B_train/131072_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/131072_none_ckpt_True.py b/configs/7B_train/131072_none_ckpt_True.py deleted file mode 100644 index 35b3f08e..00000000 --- a/configs/7B_train/131072_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 131072 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash-attn_ckpt_False.py b/configs/7B_train/16384_flash-attn_ckpt_False.py deleted file mode 100644 index 53a64b99..00000000 --- a/configs/7B_train/16384_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash-attn_ckpt_True.py b/configs/7B_train/16384_flash-attn_ckpt_True.py deleted file mode 100644 index cdb051e5..00000000 --- a/configs/7B_train/16384_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash_ckpt_False.py b/configs/7B_train/16384_flash_ckpt_False.py deleted file mode 100644 index 41b39515..00000000 --- a/configs/7B_train/16384_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_flash_ckpt_True.py b/configs/7B_train/16384_flash_ckpt_True.py deleted file mode 100644 index ca2c7f06..00000000 --- a/configs/7B_train/16384_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_intern_ckpt_False.py b/configs/7B_train/16384_intern_ckpt_False.py deleted file mode 100644 index 93abb682..00000000 --- a/configs/7B_train/16384_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_intern_ckpt_True.py b/configs/7B_train/16384_intern_ckpt_True.py deleted file mode 100644 index af9d9945..00000000 --- a/configs/7B_train/16384_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_megatron_ckpt_False.py b/configs/7B_train/16384_megatron_ckpt_False.py deleted file mode 100644 index d2c58d3a..00000000 --- a/configs/7B_train/16384_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_megatron_ckpt_True.py b/configs/7B_train/16384_megatron_ckpt_True.py deleted file mode 100644 index 6e372b8c..00000000 --- a/configs/7B_train/16384_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_none_ckpt_False.py b/configs/7B_train/16384_none_ckpt_False.py deleted file mode 100644 index 0fd65900..00000000 --- a/configs/7B_train/16384_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/16384_none_ckpt_True.py b/configs/7B_train/16384_none_ckpt_True.py deleted file mode 100644 index 6ea5e1a9..00000000 --- a/configs/7B_train/16384_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 16384 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash-attn_ckpt_False.py b/configs/7B_train/262144_flash-attn_ckpt_False.py deleted file mode 100644 index 6dad9730..00000000 --- a/configs/7B_train/262144_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash-attn_ckpt_True.py b/configs/7B_train/262144_flash-attn_ckpt_True.py deleted file mode 100644 index cacd9737..00000000 --- a/configs/7B_train/262144_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash_ckpt_False.py b/configs/7B_train/262144_flash_ckpt_False.py deleted file mode 100644 index 0e9b0173..00000000 --- a/configs/7B_train/262144_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_flash_ckpt_True.py b/configs/7B_train/262144_flash_ckpt_True.py deleted file mode 100644 index ddacc8df..00000000 --- a/configs/7B_train/262144_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_intern_ckpt_False.py b/configs/7B_train/262144_intern_ckpt_False.py deleted file mode 100644 index e5cf7694..00000000 --- a/configs/7B_train/262144_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_intern_ckpt_True.py b/configs/7B_train/262144_intern_ckpt_True.py deleted file mode 100644 index 76f9386a..00000000 --- a/configs/7B_train/262144_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_megatron_ckpt_False.py b/configs/7B_train/262144_megatron_ckpt_False.py deleted file mode 100644 index b929f9a6..00000000 --- a/configs/7B_train/262144_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_megatron_ckpt_True.py b/configs/7B_train/262144_megatron_ckpt_True.py deleted file mode 100644 index 1655631c..00000000 --- a/configs/7B_train/262144_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_none_ckpt_False.py b/configs/7B_train/262144_none_ckpt_False.py deleted file mode 100644 index 85512f07..00000000 --- a/configs/7B_train/262144_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/262144_none_ckpt_True.py b/configs/7B_train/262144_none_ckpt_True.py deleted file mode 100644 index fef559bd..00000000 --- a/configs/7B_train/262144_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 262144 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_flash-attn_ckpt_False.py b/configs/7B_train/32768_flash-attn_ckpt_False.py deleted file mode 100644 index f2664be8..00000000 --- a/configs/7B_train/32768_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_flash-attn_ckpt_True.py b/configs/7B_train/32768_flash-attn_ckpt_True.py deleted file mode 100644 index 232b5904..00000000 --- a/configs/7B_train/32768_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_flash_ckpt_False.py b/configs/7B_train/32768_flash_ckpt_False.py deleted file mode 100644 index 878b9ac1..00000000 --- a/configs/7B_train/32768_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_flash_ckpt_True.py b/configs/7B_train/32768_flash_ckpt_True.py deleted file mode 100644 index 27cffd02..00000000 --- a/configs/7B_train/32768_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_intern_ckpt_False.py b/configs/7B_train/32768_intern_ckpt_False.py deleted file mode 100644 index fcf84197..00000000 --- a/configs/7B_train/32768_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_intern_ckpt_True.py b/configs/7B_train/32768_intern_ckpt_True.py deleted file mode 100644 index aec2b68b..00000000 --- a/configs/7B_train/32768_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_megatron_ckpt_False.py b/configs/7B_train/32768_megatron_ckpt_False.py deleted file mode 100644 index 64caeeb5..00000000 --- a/configs/7B_train/32768_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_megatron_ckpt_True.py b/configs/7B_train/32768_megatron_ckpt_True.py deleted file mode 100644 index a736e7d0..00000000 --- a/configs/7B_train/32768_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_none_ckpt_False.py b/configs/7B_train/32768_none_ckpt_False.py deleted file mode 100644 index 3a31776e..00000000 --- a/configs/7B_train/32768_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/32768_none_ckpt_True.py b/configs/7B_train/32768_none_ckpt_True.py deleted file mode 100644 index 4ac09249..00000000 --- a/configs/7B_train/32768_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 32768 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash-attn_ckpt_False.py b/configs/7B_train/4096_flash-attn_ckpt_False.py deleted file mode 100644 index b3de8990..00000000 --- a/configs/7B_train/4096_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash-attn_ckpt_True.py b/configs/7B_train/4096_flash-attn_ckpt_True.py deleted file mode 100644 index b44b103f..00000000 --- a/configs/7B_train/4096_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash_ckpt_False.py b/configs/7B_train/4096_flash_ckpt_False.py deleted file mode 100644 index 8ac542d6..00000000 --- a/configs/7B_train/4096_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_flash_ckpt_True.py b/configs/7B_train/4096_flash_ckpt_True.py deleted file mode 100644 index ec477f68..00000000 --- a/configs/7B_train/4096_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_intern_ckpt_False.py b/configs/7B_train/4096_intern_ckpt_False.py deleted file mode 100644 index f16f95ad..00000000 --- a/configs/7B_train/4096_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_intern_ckpt_True.py b/configs/7B_train/4096_intern_ckpt_True.py deleted file mode 100644 index 90fed7c8..00000000 --- a/configs/7B_train/4096_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_megatron_ckpt_False.py b/configs/7B_train/4096_megatron_ckpt_False.py deleted file mode 100644 index ca41fa28..00000000 --- a/configs/7B_train/4096_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_megatron_ckpt_True.py b/configs/7B_train/4096_megatron_ckpt_True.py deleted file mode 100644 index 45183156..00000000 --- a/configs/7B_train/4096_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_none_ckpt_False.py b/configs/7B_train/4096_none_ckpt_False.py deleted file mode 100644 index c81bb5b9..00000000 --- a/configs/7B_train/4096_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/4096_none_ckpt_True.py b/configs/7B_train/4096_none_ckpt_True.py deleted file mode 100644 index a25d222f..00000000 --- a/configs/7B_train/4096_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 4096 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_flash-attn_ckpt_False.py b/configs/7B_train/65536_flash-attn_ckpt_False.py deleted file mode 100644 index 3d5a81eb..00000000 --- a/configs/7B_train/65536_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_flash-attn_ckpt_True.py b/configs/7B_train/65536_flash-attn_ckpt_True.py deleted file mode 100644 index c6982c98..00000000 --- a/configs/7B_train/65536_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_flash_ckpt_False.py b/configs/7B_train/65536_flash_ckpt_False.py deleted file mode 100644 index 0cfea813..00000000 --- a/configs/7B_train/65536_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_flash_ckpt_True.py b/configs/7B_train/65536_flash_ckpt_True.py deleted file mode 100644 index abdeb49d..00000000 --- a/configs/7B_train/65536_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_intern_ckpt_False.py b/configs/7B_train/65536_intern_ckpt_False.py deleted file mode 100644 index 2e0b27e1..00000000 --- a/configs/7B_train/65536_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_intern_ckpt_True.py b/configs/7B_train/65536_intern_ckpt_True.py deleted file mode 100644 index d1a8de7c..00000000 --- a/configs/7B_train/65536_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_megatron_ckpt_False.py b/configs/7B_train/65536_megatron_ckpt_False.py deleted file mode 100644 index 7de7b92d..00000000 --- a/configs/7B_train/65536_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_megatron_ckpt_True.py b/configs/7B_train/65536_megatron_ckpt_True.py deleted file mode 100644 index b339c833..00000000 --- a/configs/7B_train/65536_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_none_ckpt_False.py b/configs/7B_train/65536_none_ckpt_False.py deleted file mode 100644 index b8c44769..00000000 --- a/configs/7B_train/65536_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/65536_none_ckpt_True.py b/configs/7B_train/65536_none_ckpt_True.py deleted file mode 100644 index b907e437..00000000 --- a/configs/7B_train/65536_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 65536 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash-attn_ckpt_False.py b/configs/7B_train/8192_flash-attn_ckpt_False.py deleted file mode 100644 index d0ddd438..00000000 --- a/configs/7B_train/8192_flash-attn_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash-attn_ckpt_True.py b/configs/7B_train/8192_flash-attn_ckpt_True.py deleted file mode 100644 index d9e5b2f9..00000000 --- a/configs/7B_train/8192_flash-attn_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("flash-attn") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash-attn", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash_ckpt_False.py b/configs/7B_train/8192_flash_ckpt_False.py deleted file mode 100644 index 69546d11..00000000 --- a/configs/7B_train/8192_flash_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_flash_ckpt_True.py b/configs/7B_train/8192_flash_ckpt_True.py deleted file mode 100644 index 4c7f9864..00000000 --- a/configs/7B_train/8192_flash_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="flash", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_intern_ckpt_False.py b/configs/7B_train/8192_intern_ckpt_False.py deleted file mode 100644 index 9694ad81..00000000 --- a/configs/7B_train/8192_intern_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_intern_ckpt_True.py b/configs/7B_train/8192_intern_ckpt_True.py deleted file mode 100644 index 99a0fc18..00000000 --- a/configs/7B_train/8192_intern_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("intern") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="intern", intern_overlap=True), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_megatron_ckpt_False.py b/configs/7B_train/8192_megatron_ckpt_False.py deleted file mode 100644 index f18ee730..00000000 --- a/configs/7B_train/8192_megatron_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_megatron_ckpt_True.py b/configs/7B_train/8192_megatron_ckpt_True.py deleted file mode 100644 index 1db58412..00000000 --- a/configs/7B_train/8192_megatron_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("megatron") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="megatron", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_none_ckpt_False.py b/configs/7B_train/8192_none_ckpt_False.py deleted file mode 100644 index 95d686bb..00000000 --- a/configs/7B_train/8192_none_ckpt_False.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(False) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=False, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/7B_train/8192_none_ckpt_True.py b/configs/7B_train/8192_none_ckpt_True.py deleted file mode 100644 index a63b6f20..00000000 --- a/configs/7B_train/8192_none_ckpt_True.py +++ /dev/null @@ -1,181 +0,0 @@ -# JOB_NAME = "7b_train" -DO_ALERT = False - -SEQ_LEN = 8192 -JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str("none") + "_" + str(True) -HIDDEN_SIZE = 4096 -NUM_ATTENTION_HEAD = 32 -MLP_RATIO = 8 / 3 -NUM_LAYER = 32 -VOCAB_SIZE = 103168 - -MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx" -# Ckpt folder format: -# fs: 'local:/mnt/nfs/XXX' -SAVE_CKPT_FOLDER = "local:llm_ckpts" -LOAD_CKPT_FOLDER = "local:llm_ckpts/49" - -# boto3 Ckpt folder format: -# import os -# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint -# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" -# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/" -CHECKPOINT_EVERY = 50 -ckpt = dict( - enable_save_ckpt=False, # enable ckpt save. - save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), - load_ckpt_folder="local:llm_ckpts/", - # 'load_ckpt_info' setting guide: - # 1. the 'path' indicate ckpt path, - # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" - # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported. - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"), - # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering - # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) - # with an automatic restart mechanism upon training reboot. - # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint - # path specified in `load_ckpt_info` by default. - # If you want to initialize your model weights from another model, you must set `auto_resume` to False. - # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. - auto_resume=True, - checkpoint_every=CHECKPOINT_EVERY, - async_upload=True, # async ckpt upload. (only work for boto3 ckpt) - async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. - oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. -) - -TRAIN_FOLDER = "/path/to/dataset" -VALID_FOLDER = "/path/to/dataset" -data = dict( - seq_len=SEQ_LEN, - # micro_num means the number of micro_batch contained in one gradient update - micro_num=1, - # packed_length = micro_bsz * SEQ_LEN - micro_bsz=1, - # defaults to the value of micro_num - valid_micro_num=4, - # defaults to 0, means disable evaluate - valid_every=50, - pack_sample_into_one=True, - total_steps=20, - skip_batches="", - rampup_batch_size="", - # Datasets with less than 50 rows will be discarded - min_length=50, - # train_folder=TRAIN_FOLDER, - # valid_folder=VALID_FOLDER, - empty_cache_and_diag_interval=100, - diag_outlier_ratio=1.1, -) - -grad_scaler = dict( - fp16=dict( - # the initial loss scale, defaults to 2**16 - initial_scale=2**16, - # the minimum loss scale, defaults to None - min_scale=1, - # the number of steps to increase loss scale when no overflow occurs - growth_interval=1000, - ), - # the multiplication factor for increasing loss scale, defaults to 2 - growth_factor=2, - # the multiplication factor for decreasing loss scale, defaults to 0.5 - backoff_factor=0.5, - # the maximum loss scale, defaults to None - max_scale=2**24, - # the number of overflows before decreasing loss scale, defaults to 2 - hysteresis=2, -) - -hybrid_zero_optimizer = dict( - # Enable low_level_optimzer overlap_communication - overlap_sync_grad=True, - overlap_sync_param=False, - # bucket size for nccl communication params - reduce_bucket_size=512 * 1024 * 1024, - # grad clipping - clip_grad_norm=1.0, -) - -loss = dict( - label_smoothing=0, -) - -adam = dict( - lr=1e-4, - adam_beta1=0.9, - adam_beta2=0.95, - adam_beta2_c=0, - adam_eps=1e-8, - weight_decay=0.01, -) - -lr_scheduler = dict( - total_steps=data["total_steps"], - init_steps=0, # optimizer_warmup_step - warmup_ratio=0.01, - eta_min=1e-5, - last_epoch=-1, -) - -beta2_scheduler = dict( - init_beta2=adam["adam_beta2"], - c=adam["adam_beta2_c"], - cur_iter=-1, -) - -model = dict( - checkpoint=True, # The proportion of layers for activation aheckpointing, the optional value are True/False/[0-1] - num_attention_heads=NUM_ATTENTION_HEAD, - embed_split_hidden=True, - vocab_size=VOCAB_SIZE, - embed_grad_scale=1, - parallel_output=True, - hidden_size=HIDDEN_SIZE, - num_layers=NUM_LAYER, - mlp_ratio=MLP_RATIO, - apply_post_layer_norm=False, - dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" - norm_type="rmsnorm", - layer_norm_epsilon=1e-5, - use_flash_attn=True, - num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used. -) -""" -zero1 parallel (dict): - 1. size: int - * if size <= 0, the size of the zero process group is equal to the size of the dp process group, - so parameters will be divided within the range of dp. - * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. - * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. - For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. - 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. -tensor parallel (dict): - 1. size: int, the size of tensor parallel. - 2. sp: str, the sequence parallel mode, should be in ['none', 'megatron', 'flash-attn', 'intern'], - defaults to 'none', means the sequence parallel will be disabled. - 3. intern_overlap: bool, enable/disable all_gather/reduce_scatter communication overlap when using 'intern' mode sp, - defaults to False. -pipeline parallel (dict): - 1. size: int, the size of pipeline parallel. - 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, - defaults to False. -""" -parallel = dict( - zero1=dict(size=-1, fsdp=False), - tensor=dict(size=8, sp="none", intern_overlap=False), - pipeline=dict(size=1, interleaved_overlap=True), -) - -cudnn_deterministic = False -cudnn_benchmark = False - -monitor = dict( - # feishu alert configs - alert=dict( - enable_feishu_alert=DO_ALERT, - feishu_alert_address=None, # feishu webhook to send alert message - light_monitor_address=None, # light_monitor address to send heartbeat - ), -) diff --git a/configs/generate.py b/configs/generate.py index 6a58f098..a8a5898a 100644 --- a/configs/generate.py +++ b/configs/generate.py @@ -6,8 +6,8 @@ root_names = ["7B_train_", "13B_train_", "30B_train_"] model_size = ["7B", "13B", "30B"] seq_length = [4096, 8192, 16384, 32768, 65536, 131072, 262144] -sp = ["none", "megatron", "flash-attn", "intern"] -intern_overlap = [False, False, False, True] +sp = ["none", "megatron", "flash-attn", "intern", "intern"] +intern_overlap = [False, False, False, True, False] checkpoint = [False, True] for idx, root_name in enumerate(root_names): @@ -32,13 +32,29 @@ line = line.replace("{sp}", f"\"{sp_mode}\"") line = line.replace("{intern_overlap}", str(intern_overlap[i])) line = line.replace("{checkpoint}", str(ckpt)) - output_file_name = str(seq) + "_" + str(sp_mode) + "_ckpt_" + str(ckpt) + ".py" + output_file_name = str(seq) + "_" + str(sp_mode) + "_overlap_" + str(intern_overlap[i]) + "_ckpt_" + str(ckpt) + ".py" write_file = folder_path + "/" + output_file_name with open(write_file, "w") as file: file.write(line) log_name = root_name + "_" + output_file_name[:-3] - command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=10 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" + skip = True + + if idx == 0 and i == 4: # 7b, intern_overlap = False + skip = False + if idx == 0 and ckpt is True and i == 3: # 7b, ckpt = True + skip = False + if idx == 1: # 13b + skip = False + if idx == 2: # 30b + skip = False + + if skip: + import time; time.sleep(1) + print(f"skip {log_name}", flush=True) + continue + + command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=20 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log" process = subprocess.Popen(command, shell=True, executable='/bin/bash') process.wait() \ No newline at end of file