# gpt2_qwen_training.yaml
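# Training config for the SuperTinyLanguageModels framework: a standard model shell
# pairing a GPT-2 tokenizer/embedder with the Hugging Face Qwen/Qwen2-0.5B core,
# trained on OpenWebText with a nanoGPT-style AdamW setup.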
model:
  model_string: "Qwen/Qwen2-0.5B"
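  # hf_core presumably wraps the pretrained Hugging Face checkpoint named in
  # model_string above as the transformer trunk.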
  core_model:
    core_model_type: hf_core
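  # GPT-2 BPE tokenizer paired with a generic learned embedding table.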
  embedder:
    tokenizer_type: gpt2
    embedding_model_type: generic
    dataset_name: stlm
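  # Bias-free generic LM head; rms_norm is assumed to apply a final RMSNorm
  # before the vocabulary projection.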
  lm_head:
    normalization: rms_norm
    bias: false
    lm_head_type: generic
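  # hidden_dim 896 matches Qwen2-0.5B's hidden size and vocab_size 50257 matches
  # the GPT-2 tokenizer; embeddings are tied to the LM head and positions use RoPE.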
  hidden_dim: 896
  context_window: 512
  vocab_size: 50257
  model_shell_type: standard
  embedding_weight_tying: true
  positional_encoding_type: rope
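# Training setup: constant dropout of 0.1 throughout, with OpenWebText as the corpus.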
trainer:
  dropout_scheduler:
    dropout_type: constant
    dropout: 0.1
  dataset: openwebtext
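  # Effective batch size: 6 x 20 gradient-accumulation steps = 120 sequences
  # (~61k tokens at a 512-token context) per optimizer step; the huge
  # checkpoint_interval effectively disables intermediate checkpoints.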
  training:
    trainer_type: base_trainer
    batch_size: 6
    gradient_accumulation_steps: 20
    max_iters: 50000
    lr_decay_iters: 50000
    warmup_iters: 5000
    eval_interval: 2000
    log_interval: 10
    eval_iters: 500
    checkpoint_interval: 1000000000.0
    run_profiler: false
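  # Evaluation: ft_qa (presumably fine-tuned QA) and mcq (multiple-choice) evaluators
  # over the same five benchmarks, plus glue and prog evaluators left at their defaults.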
  eval:
    - evaluator: "ft_qa"
      benchmarks:
        - "winograd"
        - "hellaswag"
        - "arc"
        - "mmlu"
        - "blimp"
      max_train_samples: 1000
      max_eval_samples: 1000
    - evaluator: "glue"
    - benchmarks:
        - "winograd"
        - "hellaswag"
        - "arc"
        - "mmlu"
        - "blimp"
      num_samples: 1000
      evaluator: "mcq"
    - evaluator: "prog"
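  # nanoGPT-style AdamW: LR warms up over 5k iters, then cosine-decays from 6e-4
  # to 6e-5 over 50k iters; gradients are clipped at 1.0.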
  optimizer:
    name: nanoGPTadamW
    lr: 0.0006
    min_lr: 6.0e-05
    weight_decay: 0.1
    beta1: 0.9
    beta2: 0.95
    grad_clip: 1.0
    decay_lr: true
    warmup_iters: 5000
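  # Cosine LR schedule, standard dataloader, and cross-entropy training loss.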
  lr_scheduler:
    name: cosine
  dataloader:
    name: standard
  loss_fn:
    name: cross_entropy
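# Run-level settings: W&B logging (disabled here), output/data/checkpoint paths,
# RNG seed, and target device.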
general:
  logging:
    wandb_log: False
    wandb_project: SuperTinyLanguageModels
  paths:
    output_dir: outputs
    data_dir: data
    checkpoint_dir: checkpoints
  seed: 489
  device: cuda