# byte_level_ffn_sharing.yaml
# Forked from LeonGuertler/SuperTinyLanguageModels.
# Model configuration.
# NOTE(review): this section was recovered from a copy whose indentation had
# been flattened, which left `normalization`, `bias` and `byte_context_window`
# duplicated in a single mapping. The nesting below is reconstructed from key
# order -- confirm it against the config loader before relying on it.
model:
  core_model:
    core_model_type: generic_ffn_sharing
    num_layers: 16
    ffn:
      ffn_type: swiglu
      ffn_dim: 1536
      normalization: rms_norm
      bias: false
    attn:
      attn_type: generic
      num_heads: 16
      normalization: rms_norm
      group_size: 4
      bias: false
      is_causal: true
  embedder:
    tokenizer_type: "gpt2"
    byte_tokenizer_type: "bpe"
    embedding_model_type: byte_level
    # Same value as the model-level `byte_context_window` further down.
    byte_context_window: 12
    dataset_name: simple_en_wiki
  lm_head:
    normalization: rms_norm
    bias: false
    lm_head_type: byte_level
  # NOTE(review): the keys from here on are assumed to sit directly under
  # `model` rather than under `lm_head` -- TODO confirm.
  hidden_dim: 512
  context_window: 512
  vocab_size: 50257
  byte_vocab_size: 258
  byte_context_window: 12
  byte_embedding_dim: 128
  model_shell_type: standard
  embedding_weight_tying: false
  positional_encoding_type: rope
# Training configuration.
# NOTE(review): this section was recovered from a copy whose indentation had
# been flattened, which left `name` (x4) and `warmup_iters` (x2) duplicated in
# a single mapping. The nesting below is reconstructed from key order --
# confirm it against the config loader before relying on it.
trainer:
  dropout_scheduler:
    dropout_type: linear
    start_dropout_p: 0.0
    end_dropout_p: 0.1
    start_iter: 0
    end_iter: 10000
  dataset: simple_en_wiki
  training:
    trainer_type: base_trainer
    batch_size: 24
    gradient_accumulation_steps: 20
    max_iters: 25000
    lr_decay_iters: 25000
    warmup_iters: 5000
    eval_interval: 5000
    log_interval: 100
    eval_iters: 500
    checkpoint_interval: 1000000000.0
    run_profiler: false
  optimizer:
    name: nanoGPTadamW
    lr: 0.0006
    min_lr: 6.0e-05
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    grad_clip: 1.0
    decay_lr: true
    # Same value as `training.warmup_iters` above; presumably the two should
    # be kept in sync -- verify which one the code actually reads.
    warmup_iters: 5000
  lr_scheduler:
    name: cosine
  dataloader:
    name: byte_pooling
  loss_fn:
    name: cross_entropy
  # NOTE(review): `eval` is assumed to belong under `trainer` (it appears
  # between `loss_fn` and `general`); move it to the top level if the loader
  # expects it there.
  eval:
    benchmarks:
      - "winograd"
      - "hellaswag"
      - "arc"
      - "mmlu"
      - "blimp"
    num_samples: 5000
    evaluator: "mcq"
# General run settings: logging, filesystem paths, seed and device.
# NOTE(review): this section was recovered from a copy whose indentation had
# been flattened, so `logging:` and `paths:` parsed as empty (null) keys. The
# nesting below is reconstructed from key order -- confirm it against the
# config loader before relying on it.
general:
  logging:
    wandb_log: true
    wandb_project: SuperTinyLanguageModels
  paths:
    output_dir: outputs
    data_dir: data
    checkpoint_dir: checkpoints
  # NOTE(review): `seed` and `device` are assumed to sit directly under
  # `general` rather than under `paths` -- TODO confirm.
  seed: 489
  device: cuda