-
Notifications
You must be signed in to change notification settings - Fork 1.1k
/
Copy pathtinystories.yaml
135 lines (98 loc) · 4.37 KB
/
tinystories.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
# ``model_config``. (type: Optional[str], default: null)
model_name: stories15M
# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
# ``model_config``. (type: Optional[Config], default: null)
model_config:
name: stories15M
hf_config: {}
scale_embeddings: false
block_size: 256
padded_vocab_size: 32000
n_layer: 6
n_head: 6
n_query_groups: 6
n_embd: 288
head_size: 48
rotary_percentage: 1.0
parallel_residual: false
bias: false
norm_class_name: RMSNorm
mlp_class_name: LLaMAMLP
intermediate_size: 768
# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
out_dir: out/pretrain/stories15M
# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
precision: bf16-mixed
# Optional path to a checkpoint directory to initialize the model from.
# Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
initial_checkpoint_dir:
# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
# (type: Union[bool, Literal["auto"], Path], default: False)
resume: false
# Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
data: TinyStories
# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:
# Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
save_interval: 1000
# Number of iterations between logging calls (type: int, default: 1)
log_interval: 1
# Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
global_batch_size: 512
# Number of samples per data-parallel rank (type: int, default: 4)
micro_batch_size: 128
# Number of iterations with learning rate warmup active (type: int, default: 2000)
lr_warmup_steps: 1000
# Number of epochs to train on (type: Optional[int], default: null)
epochs:
# Total number of tokens to train on (type: Optional[int], default: 3000000000000)
max_tokens: 9700000000 # original did 298,000 iters
# Limits the number of optimizer steps to run. (type: Optional[int], default: null)
max_steps:
# Limits the length of samples. Off by default (type: Optional[int], default: null)
max_seq_length: 256
# Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
tie_embeddings: true
# (type: Optional[float], default: 1.0)
max_norm: 1.0
# (type: float, default: 4e-05)
min_lr: 0.0
# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:
# Number of optimizer steps between evaluation calls (type: int, default: 1000)
interval: 2000
# Number of tokens to generate (type: Optional[int], default: null)
max_new_tokens:
# Number of iterations (type: int, default: 100)
max_iters: 100
# Whether to evaluate on the validation set at the beginning of the training
initial_validation: false
# Whether to evaluate on the validation set at the end the training
final_validation: false
# Optimizer-related arguments
optimizer:
class_path: torch.optim.AdamW
init_args:
# (type: float, default: 0.001)
lr: 0.0005
# (type: float, default: 0.01)
weight_decay: 0.1
# (type: tuple, default: (0.9,0.999))
betas:
- 0.9
- 0.95
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
devices: auto
# How many nodes to use. (type: int, default: 1)
num_nodes: 1
# Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
# module require this. (type: Optional[Path], default: null)
tokenizer_dir: checkpoints/meta-llama/Llama-2-7b-hf
# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
logger_name: csv
# The random seed to use for reproducibility. (type: int, default: 42)
seed: 42