# baseline_next_thought.yaml
# Model architecture: a next-thought baseline with a hierarchical embedder and a
# latent-to-sequence decoder head.
model:
  core_model:
    core_model_type: next_thought_baseline
  embedder:
    # Hierarchical embedder: GPT-2 tokenization, then pooling of the 512-token
    # context into a single 4800-dimensional latent.
    tokenizer_type: gpt2
    embedding_model_type: hierarchical
    dataset_name: simple_en_wiki
    pooling_layers: 5
    pooling_dims: [768, 1920, 1920, 1920, 4800]
    pooling_pct_per_layer: [0.3, 0.5, 0.6, 0.6]
    num_heads: 12
    context_window: 512
    standard_ffn_block:
      ffn_type: swiglu
      ffn_dim: 1536
      normalization: rms_norm
      bias: false
    standard_attn_block:
      attn_type: generic
      num_heads: 16
      normalization: rms_norm
      group_size: 4
      bias: false
      is_causal: false
  lm_head:
    # Decoder head: expands the latent back into a token sequence.
    lm_head_type: latent_2_seq
    latent_decoded_into: 16
    num_layers: 4
    standard_ffn_block:
      ffn_type: swiglu
      ffn_dim: 1536
      normalization: rms_norm
      bias: false
    standard_attn_block:
      attn_type: generic
      num_heads: 16
      normalization: rms_norm
      group_size: 4
      bias: false
      is_causal: true
  latent_dim: 4800
  embedding_dim: 768
  hidden_dim: 768
  context_window: 512
  vocab_size: 50257
  model_shell_type: standard
  embedding_weight_tying: false
  positional_encoding_type: learned

# Training setup: constant dropout, nanoGPT-style AdamW with cosine decay, and a
# conversational dataloader over openhermes-2.5.
trainer:
  dropout_scheduler:
    dropout_type: constant
    dropout: 0.1
    start_dropout_p: 0.0
    end_dropout_p: 0.1
    start_iter: 0
    end_iter: 10000
  dataset: openhermes-2.5
  training:
    trainer_type: base_trainer
    batch_size: 24
    gradient_accumulation_steps: 20
    max_iters: 25000
    lr_decay_iters: 25000
    warmup_iters: 5000
    eval_interval: 2000
    log_interval: 10
    eval_iters: 500
    checkpoint_interval: 1000000000.0
    run_profiler: false
  optimizer:
    name: nanoGPTadamW
    lr: 0.0018
    min_lr: 6.0e-05
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    grad_clip: 1.0
    decay_lr: true
    warmup_iters: 5000
  lr_scheduler:
    name: cosine
  dataloader:
    name: conversational
  loss_fn:
    name: cross_entropy
  eval:
    benchmarks:
      - "winograd"
      - "hellaswag"
      - "arc"
      - "mmlu"
      - "blimp"
    num_samples: 5000
    evaluator: "mcq"

# Logging, output paths, and runtime settings.
general:
  logging:
    wandb_log: true
    wandb_project: SuperTinyLanguageModels
  paths:
    output_dir: outputs
    data_dir: data
    checkpoint_dir: checkpoints
  seed: 489
  device: cuda
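
# A minimal sketch of reading this file with plain PyYAML, kept in comments so the
# file remains valid YAML. This is an assumption for illustration only; the
# SuperTinyLanguageModels training scripts may load configs through their own
# machinery rather than yaml.safe_load.
#
#   import yaml
#
#   with open("baseline_next_thought.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   # Nested keys mirror the structure above.
#   print(cfg["model"]["core_model"]["core_model_type"])  # next_thought_baseline
#   print(cfg["trainer"]["training"]["batch_size"])       # 24
#   print(cfg["general"]["logging"]["wandb_project"])     # SuperTinyLanguageModels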