# baseline.yaml (forked from LeonGuertler/SuperTinyLanguageModels)
model:
  core_model:
    core_model_type: generic
    num_layers: 8
    ffn:
      ffn_type: swiglu
      ffn_dim: 1320
      normalization: rms_norm
      bias: false
    attn:
      attn_type: generic
      num_heads: 16
      normalization: rms_norm
      group_size: 4
      bias: false
      is_causal: true
  embedder:
    tokenizer_type: gpt2
    embedding_model_type: generic
    dataset_name: stlm
  lm_head:
    normalization: rms_norm
    bias: false
    lm_head_type: generic
  hidden_dim: 512
  context_window: 512
  vocab_size: 50257
  model_shell_type: standard
  embedding_weight_tying: true
  positional_encoding_type: rope
trainer:
  dropout_scheduler:
    dropout_type: constant
    dropout: 0.1
  dataset: openwebtext
  training:
    trainer_type: base_trainer
    batch_size: 24
    gradient_accumulation_steps: 20
    max_iters: 30000
    lr_decay_iters: 30000
    warmup_iters: 5000
    eval_interval: 2000
    log_interval: 10
    eval_iters: 500
    checkpoint_interval: 1000000000.0
    run_profiler: false
  eval:
    - benchmarks:
        - "winograd"
        - "hellaswag"
        - "arc"
        - "mmlu"
        - "blimp"
      num_samples: 1000
      evaluator: "mcq"
    - evaluator: "prog"
  optimizer:
    name: nanoGPTadamW
    lr: 0.0006
    min_lr: 6.0e-05
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    grad_clip: 1.0
    decay_lr: true
    warmup_iters: 5000
  lr_scheduler:
    name: cosine
  dataloader:
    name: standard
  datasampling:
    name: standard
  loss_fn:
    name: cross_entropy
general:
  logging:
    wandb_log: false
    wandb_project: SuperTinyLanguageModels
  paths:
    output_dir: outputs
    data_dir: data
    checkpoint_dir: checkpoints
  seed: 489
  device: cuda
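
# Usage note: a minimal sketch of reading this config with the OmegaConf library
# (the library Hydra builds on). Whether SuperTinyLanguageModels loads it exactly
# this way is an assumption; the snippet only illustrates how the nested keys
# above resolve and what the batch settings imply per optimizer step.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("baseline.yaml")        # parse this file into a DictConfig
#   print(cfg.model.hidden_dim)                  # 512
#   print(cfg.trainer.optimizer.lr)              # 0.0006
#
#   # tokens per optimizer step = batch_size * grad_accum_steps * context_window
#   tokens_per_step = (cfg.trainer.training.batch_size
#                      * cfg.trainer.training.gradient_accumulation_steps
#                      * cfg.model.context_window)   # 24 * 20 * 512 = 245,760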