forked from InternLM/InternLM
Commit 16ef7b7 (1 parent: 5abe519)
Showing 6 changed files with 547 additions and 7 deletions.

@@ -0,0 +1,180 @@
JOB_NAME = "13b_train"
DO_ALERT = False

SEQ_LEN = 4096
HIDDEN_SIZE = 5120
NUM_ATTENTION_HEAD = 40
MLP_RATIO = 8 / 3
NUM_LAYER = 40
VOCAB_SIZE = 103168

MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
    enable_save_ckpt=False,  # enable ckpt save.
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
    load_ckpt_folder="local:llm_ckpts/",
    # 'load_ckpt_info' setting guide:
    # 1. the 'path' indicates the checkpoint path,
    # 2. the 'content' specifies which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
    # 3. the 'ckpt_type' specifies the type of checkpoint to be loaded; currently only the 'normal' type is supported.
    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
    # with an automatic restart mechanism upon training reboot.
    # Please be aware that if `auto_resume` is not set (its default value is True), the checkpoint path specified in
    # `load_ckpt_info` will not be loaded by default.
    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
    auto_resume=True,
    checkpoint_every=CHECKPOINT_EVERY,
    async_upload=True,  # async ckpt upload (only works for boto3 ckpt).
    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
)
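# Illustrative variants (not part of the original config), following the 'load_ckpt_info'
# guide above: resume the full training state, or warm-start from model weights only
# (the latter also requires auto_resume=False, per the comments above):
# load_ckpt_info = dict(path=LOAD_CKPT_FOLDER, content=("all",), ckpt_type="internlm")
# load_ckpt_info = dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm")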

TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num is the number of micro-batches in one gradient update
    micro_num=4,
    # packed_length = micro_bsz * SEQ_LEN
    micro_bsz=2,
    # defaults to the value of micro_num
    valid_micro_num=4,
    # defaults to 0, which disables evaluation
    valid_every=50,
    pack_sample_into_one=False,
    total_steps=20,
    skip_batches="",
    rampup_batch_size="",
    # Datasets with fewer than 50 rows will be discarded
    min_length=50,
    # train_folder=TRAIN_FOLDER,
    # valid_folder=VALID_FOLDER,
    empty_cache_and_diag_interval=10,
    diag_outlier_ratio=1.1,
)
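# Worked example (derived from the values above, not part of the original config):
# packed_length = micro_bsz * SEQ_LEN = 2 * 4096 = 8192 tokens per micro-batch, so one
# gradient update on a single data-parallel rank covers roughly
# micro_num * micro_bsz * SEQ_LEN = 4 * 2 * 4096 = 32768 tokens.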

grad_scaler = dict(
    fp16=dict(
        # the initial loss scale, defaults to 2**16
        initial_scale=2**16,
        # the minimum loss scale, defaults to None
        min_scale=1,
        # the number of consecutive overflow-free steps before the loss scale is increased
        growth_interval=1000,
    ),
    # the multiplication factor for increasing the loss scale, defaults to 2
    growth_factor=2,
    # the multiplication factor for decreasing the loss scale, defaults to 0.5
    backoff_factor=0.5,
    # the maximum loss scale, defaults to None
    max_scale=2**24,
    # the number of overflows before decreasing the loss scale, defaults to 2
    hysteresis=2,
)
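# Illustrative summary of how the dynamic loss-scaling fields above interact
# (not part of the original config): the scale starts at initial_scale (2**16);
# after growth_interval (1000) consecutive overflow-free steps it is multiplied by
# growth_factor (2), up to max_scale (2**24); after hysteresis (2) overflows it is
# multiplied by backoff_factor (0.5), never dropping below min_scale (1).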

hybrid_zero_optimizer = dict(
    # Enable communication overlap in the low-level optimizer
    overlap_sync_grad=True,
    overlap_sync_param=True,
    # bucket size for nccl communication of params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

loss = dict(
    label_smoothing=0,
)

adam = dict(
    lr=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_beta2_c=0,
    adam_eps=1e-8,
    weight_decay=0.01,
)

lr_scheduler = dict(
    total_steps=data["total_steps"],
    init_steps=0,  # optimizer_warmup_step
    warmup_ratio=0.01,
    eta_min=1e-5,
    last_epoch=-1,
)
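# Worked example (assumed interpretation, not part of the original config): the warmup
# length is presumably warmup_ratio * total_steps; with the demo value total_steps=20
# that is 0.01 * 20 = 0.2 steps (effectively no warmup), whereas a 50000-step run would
# warm up for about 500 steps before annealing toward eta_min=1e-5.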

beta2_scheduler = dict(
    init_beta2=adam["adam_beta2"],
    c=adam["adam_beta2_c"],
    cur_iter=-1,
)

model = dict(
    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False or a float in [0, 1]
    num_attention_heads=NUM_ATTENTION_HEAD,
    embed_split_hidden=True,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    apply_post_layer_norm=False,
    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,
    num_chunks=1,  # if num_chunks > 1, the interleaved pipeline scheduler is used.
)
"""
zero1 parallel (dict):
    1. size: int
        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
          so parameters will be divided within the range of dp.
        * if size == 1, zero is not used, and all dp groups retain the full set of model parameters.
        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
    1. size: int, the size of tensor parallel.
    2. mode: str, either 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
       sequence_parallel should be True.
pipeline parallel (dict):
    1. size: int, the size of pipeline parallel.
    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
       defaults to False.
sequence parallel (bool): enable/disable sequence parallel, defaults to False.
"""
parallel = dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=8, mode="origin_tp"),
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=True,
)
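# Derived numbers (arithmetic on the values above, not part of the original config):
# per-head dimension = HIDDEN_SIZE / NUM_ATTENTION_HEAD = 5120 / 40 = 128, and the MLP
# inner width is on the order of MLP_RATIO * HIDDEN_SIZE = (8 / 3) * 5120 ≈ 13653
# (implementations typically round this up to a hardware-friendly multiple).
# For the parallel layout, assuming for illustration a 32-GPU job:
# data-parallel size = world_size / (tensor * pipeline) = 32 / (8 * 1) = 4, and since
# zero1.size = -1 (<= 0), the ZeRO process group spans the whole data-parallel group,
# as described in the docstring above.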

cudnn_deterministic = False
cudnn_benchmark = False

monitor = dict(
    # feishu alert configs
    alert=dict(
        enable_feishu_alert=DO_ALERT,
        feishu_alert_address=None,  # feishu webhook to send alert message
        light_monitor_address=None,  # light_monitor address to send heartbeat
    ),
)
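# Illustrative only (not part of the original config): to receive feishu alerts, set
# DO_ALERT = True at the top of the file and point feishu_alert_address at your bot's
# webhook URL; otherwise the alert block above is effectively disabled.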

@@ -0,0 +1,180 @@
JOB_NAME = "13b_train"
DO_ALERT = False

SEQ_LEN = 4096
HIDDEN_SIZE = 5120
NUM_ATTENTION_HEAD = 40
MLP_RATIO = 8 / 3
NUM_LAYER = 60
VOCAB_SIZE = 103168

MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
SAVE_CKPT_FOLDER = "local:llm_ckpts"
LOAD_CKPT_FOLDER = "local:llm_ckpts/49"

# boto3 Ckpt folder format:
# import os
# BOTO3_IP = os.environ["BOTO3_IP"]  # boto3 bucket endpoint
# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
CHECKPOINT_EVERY = 50
ckpt = dict(
    enable_save_ckpt=False,  # enable ckpt save.
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
    load_ckpt_folder="local:llm_ckpts/",
    # 'load_ckpt_info' setting guide:
    # 1. the 'path' indicates the checkpoint path,
    # 2. the 'content' specifies which states will be loaded; supported: "model", "sampler", "optimizer", "scheduler", "all"
    # 3. the 'ckpt_type' specifies the type of checkpoint to be loaded; currently only the 'normal' type is supported.
    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
    # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
    # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
    # with an automatic restart mechanism upon training reboot.
    # Please be aware that if `auto_resume` is not set (its default value is True), the checkpoint path specified in
    # `load_ckpt_info` will not be loaded by default.
    # If you want to initialize your model weights from another model, you must set `auto_resume` to False.
    # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
    auto_resume=True,
    checkpoint_every=CHECKPOINT_EVERY,
    async_upload=True,  # async ckpt upload (only works for boto3 ckpt).
    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
)

TRAIN_FOLDER = "/path/to/dataset"
VALID_FOLDER = "/path/to/dataset"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num is the number of micro-batches in one gradient update
    micro_num=4,
    # packed_length = micro_bsz * SEQ_LEN
    micro_bsz=4,
    # defaults to the value of micro_num
    valid_micro_num=4,
    # defaults to 0, which disables evaluation
    valid_every=50,
    pack_sample_into_one=False,
    total_steps=20,
    skip_batches="",
    rampup_batch_size="",
    # Datasets with fewer than 50 rows will be discarded
    min_length=50,
    # train_folder=TRAIN_FOLDER,
    # valid_folder=VALID_FOLDER,
    empty_cache_and_diag_interval=10,
    diag_outlier_ratio=1.1,
)
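# Worked example (derived from the values above, not part of the original config):
# with micro_bsz=4, packed_length = micro_bsz * SEQ_LEN = 4 * 4096 = 16384 tokens per
# micro-batch, so one gradient update on a single data-parallel rank covers roughly
# micro_num * micro_bsz * SEQ_LEN = 4 * 4 * 4096 = 65536 tokens, twice that of the
# micro_bsz=2 config above.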

grad_scaler = dict(
    fp16=dict(
        # the initial loss scale, defaults to 2**16
        initial_scale=2**16,
        # the minimum loss scale, defaults to None
        min_scale=1,
        # the number of consecutive overflow-free steps before the loss scale is increased
        growth_interval=1000,
    ),
    # the multiplication factor for increasing the loss scale, defaults to 2
    growth_factor=2,
    # the multiplication factor for decreasing the loss scale, defaults to 0.5
    backoff_factor=0.5,
    # the maximum loss scale, defaults to None
    max_scale=2**24,
    # the number of overflows before decreasing the loss scale, defaults to 2
    hysteresis=2,
)

hybrid_zero_optimizer = dict(
    # Enable communication overlap in the low-level optimizer
    overlap_sync_grad=True,
    overlap_sync_param=True,
    # bucket size for nccl communication of params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
    clip_grad_norm=1.0,
)

loss = dict(
    label_smoothing=0,
)

adam = dict(
    lr=1e-4,
    adam_beta1=0.9,
    adam_beta2=0.95,
    adam_beta2_c=0,
    adam_eps=1e-8,
    weight_decay=0.01,
)

lr_scheduler = dict(
    total_steps=data["total_steps"],
    init_steps=0,  # optimizer_warmup_step
    warmup_ratio=0.01,
    eta_min=1e-5,
    last_epoch=-1,
)

beta2_scheduler = dict(
    init_beta2=adam["adam_beta2"],
    c=adam["adam_beta2_c"],
    cur_iter=-1,
)

model = dict(
    checkpoint=False,  # The proportion of layers for activation checkpointing; valid values are True/False or a float in [0, 1]
    num_attention_heads=NUM_ATTENTION_HEAD,
    embed_split_hidden=True,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    apply_post_layer_norm=False,
    dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,
    num_chunks=1,  # if num_chunks > 1, the interleaved pipeline scheduler is used.
)
"""
zero1 parallel (dict):
    1. size: int
        * if size <= 0, the size of the zero process group is equal to the size of the dp process group,
          so parameters will be divided within the range of dp.
        * if size == 1, zero is not used, and all dp groups retain the full set of model parameters.
        * if size > 1 and size <= dp world size, the world size of zero is a subset of the dp world size.
          For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
    2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False.
tensor parallel (dict):
    1. size: int, the size of tensor parallel.
    2. mode: str, either 'origin_tp' or 'fstp', defaults to 'origin_tp'. If the mode is 'fstp',
       sequence_parallel should be True.
pipeline parallel (dict):
    1. size: int, the size of pipeline parallel.
    2. interleaved_overlap: bool, enable/disable communication overlap when using the interleaved pipeline scheduler,
       defaults to False.
sequence parallel (bool): enable/disable sequence parallel, defaults to False.
"""
parallel = dict(
    zero1=dict(size=-1, fsdp=False),
    tensor=dict(size=8, mode="fstp"),
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=True,
)
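# Note (not part of the original config): this variant sets the tensor-parallel mode to
# "fstp", which, per the docstring above, requires sequence_parallel=True, as set here.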

cudnn_deterministic = False
cudnn_benchmark = False

monitor = dict(
    # feishu alert configs
    alert=dict(
        enable_feishu_alert=DO_ALERT,
        feishu_alert_address=None,  # feishu webhook to send alert message
        light_monitor_address=None,  # light_monitor address to send heartbeat
    ),
)
(The remaining changed files failed to load and are not shown.)