
Commit

Merge branch 'develop' of https://github.com/PaddlePaddle/PaddleNLP into optimize_qwen
DesmonDay committed Oct 24, 2023
2 parents c8d7869 + 60fd6be commit 0512194
Showing 20 changed files with 1,018 additions and 115 deletions.
7 changes: 6 additions & 1 deletion llm/README.md
@@ -11,7 +11,7 @@
| [GPT-3](./gpt-3) | ✅ | ✅ | ✅ | 🚧 | ✅ | 🚧 |
| [OPT](./opt) | 🚧 | ✅ | ✅ | 🚧 | ✅ | 🚧 |
| [GLM](./glm) | ✅ | ✅ | ✅ | 🚧 | ✅ | 🚧 |
- | [Qwen](./qwen) | 🚧 | ✅ | ✅ | ✅ | ✅ | 🚧 |
+ | [Qwen](./qwen) | ✅ | ✅ | ✅ | ✅ | ✅ | 🚧 |


* ✅: Supported
@@ -39,6 +39,11 @@

## 2. Pretraining
The [LLaMA v1/v2](./llama) and [GPT-3](./gpt-3) directories provide the data preparation and training details for model pretraining; pretraining support for more models will follow.
```
# Qwen model pretraining
python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./qwen/pretrain_argument_stage2.json
```

## 3. Fine-tuning
The unified fine-tuning script currently supports [LLaMA v1/v2](./llama), [ChatGLM-6B](./chatglm), [ChatGLM2-6B](./chatglm2), [Bloom](./bloom), [OPT](./opt), and [Qwen](./qwen); for other models, see the corresponding model directories. Below we take **Llama 2** as an example of how to run SFT, LoRA, and Prefix Tuning with the unified script. For more on LoRA and Prefix Tuning, see the [PEFT documentation](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/docs/peft.md).
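As a hedged illustration of the unified launch pattern (the script name `finetune_generation.py` and the config path `./llama/sft_argument.json` are assumptions based on this repository's conventions and do not appear in this diff):

```
# Llama 2 SFT via the unified fine-tuning script (names are assumptions)
python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" finetune_generation.py ./llama/sft_argument.json
```

LoRA and Prefix Tuning would swap in the corresponding argument file under the same pattern.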
2 changes: 1 addition & 1 deletion llm/ernie-3.5-se/run_pretrain.py
@@ -397,7 +397,7 @@ def main():
use_progressive_seq_len=True,
)
else:
- model = model_class._from_config(config, dtype=dtype)
+ model = model_class.from_config(config, dtype=dtype)

# Create the learning_rate scheduler and optimizer
if training_args.decay_steps is None:
2 changes: 1 addition & 1 deletion llm/gpt-3/run_pretrain.py
@@ -411,7 +411,7 @@ def main():
dtype=dtype,
)
else:
- model = model_class._from_config(config, dtype=dtype)
+ model = model_class.from_config(config, dtype=dtype)

# Create the learning_rate scheduler and optimizer
if training_args.decay_steps is None:
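Both hunks above replace the private `_from_config` helper with the public `from_config` entry point. A minimal usage sketch, assuming the auto classes expose the same `from_config(config, dtype=...)` signature the hunks show on `model_class`:

```python
# Sketch: build a randomly initialized model from a config object.
# (AutoConfig/AutoModelForCausalLM usage is an assumption; the diffs
# only confirm `model_class.from_config(config, dtype=dtype)`.)
from paddlenlp.transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("qwen/qwen-7b")
# dtype selects the parameter precision, matching the dtype argument above.
model = AutoModelForCausalLM.from_config(config, dtype="bfloat16")
```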
39 changes: 39 additions & 0 deletions llm/qwen/pretrain_argument_stage2.json
@@ -0,0 +1,39 @@
{
"model_name_or_path": "qwen/qwen-7b",
"tokenizer_name_or_path": "qwen/qwen-7b",
"input_dir": "./data",
"output_dir": "./checkpoints/qwen_pretrain_ckpts",
"per_device_train_batch_size": 2,
"gradient_accumulation_steps": 1,
"per_device_eval_batch_size": 2,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"sharding": "stage2",
"virtual_pp_degree": 1,
"sequence_parallel": 0,
"use_flash_attention": true,
"use_fused_rms_norm": true,
"max_seq_length": 4096,
"learning_rate": 3e-05,
"min_learning_rate": 3e-06,
"warmup_steps": 30,
"logging_steps": 1,
"max_steps": 10000,
"save_steps": 5000,
"eval_steps": 1000,
"weight_decay": 0.01,
"bf16": true,
"fp16_opt_level": "O2",
"warmup_ratio": 0.01,
"max_grad_norm": 1.0,
"dataloader_num_workers": 1,
"continue_training": 1,
"do_train": true,
"do_eval": true,
"do_predict": true,
"disable_tqdm": true,
"recompute": true,
"distributed_dataloader": 1,
"recompute_granularity": "full",
"save_total_limit": 2
}
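For reference, the README hunk above launches this config as follows; with `tensor_parallel_degree` and `pipeline_parallel_degree` both 1, the eight launched GPUs run pure data parallelism, with stage-2 sharding of optimizer state and gradients:

```
# Qwen pretraining with the sharding-stage2 config (8-way data parallel)
python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./qwen/pretrain_argument_stage2.json
```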
39 changes: 39 additions & 0 deletions llm/qwen/pretrain_argument_tp2pp4.json
@@ -0,0 +1,39 @@
{
"model_name_or_path": "qwen/qwen-7b",
"tokenizer_name_or_path": "qwen/qwen-7b",
"input_dir": "./data",
"output_dir": "./checkpoints/qwen_pretrain_ckpts",
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 16,
"per_device_eval_batch_size": 16,
"tensor_parallel_degree": 2,
"pipeline_parallel_degree": 4,
"sharding": "stage1",
"virtual_pp_degree": 1,
"sequence_parallel": 0,
"use_flash_attention": true,
"use_fused_rms_norm": true,
"max_seq_length": 4096,
"learning_rate": 3e-05,
"min_learning_rate": 3e-06,
"warmup_steps": 30,
"logging_steps": 1,
"max_steps": 10000,
"save_steps": 5000,
"eval_steps": 1000,
"weight_decay": 0.01,
"bf16": true,
"fp16_opt_level": "O2",
"warmup_ratio": 0.01,
"max_grad_norm": 1.0,
"dataloader_num_workers": 1,
"continue_training": 1,
"do_train": true,
"do_eval": true,
"do_predict": true,
"disable_tqdm": true,
"recompute": true,
"distributed_dataloader": 1,
"recompute_granularity": "full",
"save_total_limit": 2
}
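This config is not launched anywhere in the diff; a hedged launch line following the same pattern (`tensor_parallel_degree` 2 × `pipeline_parallel_degree` 4 = 8, so the process count must match 8 GPUs):

```
# Qwen pretraining with the tp2pp4 config (2-way TP x 4-way PP = 8 GPUs)
python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py ./qwen/pretrain_argument_tp2pp4.json
```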
