fix(conflicts): resolve conflicts from merging feat/fstp-refactor
huangting4201 committed Oct 25, 2023
2 parents 7250453 + cc20fa2 commit ab97724
Showing 39 changed files with 1,438 additions and 128 deletions.
76 changes: 76 additions & 0 deletions .github/workflows/unit_tests.yaml
@@ -0,0 +1,76 @@
name: unit-tests
on:
  push:
    branches:
      - "develop"
      - "main"
    paths-ignore:
      - "cmds/**"
      - "**.md"
  pull_request:
    branches:
      - "develop"
      - "main"
    paths-ignore:
      - "cmds/**"
      - "**.md"
env:
  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
  SLURM_PARTITION: llm_t

jobs:
  check-requirements:
    runs-on: [t_cluster]
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      - name: check-requirements
        run: |
          changed_files=$(git diff --name-only -r HEAD^1 HEAD)
          echo $changed_files
          if [[ $changed_files =~ "runtime.txt" ]]; then
            pip install -r requirements/runtime.txt
          fi
          if [[ $changed_files =~ "torch.txt" ]]; then
            pip install -r requirements/torch.txt
          fi
  unit_tests_core_pipeline:
    if: ${{ always() }}
    needs: check-requirements
    runs-on: [t_cluster]
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
      - uses: actions/checkout@v3

      - name: core_pipeline
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          export PYTHONPATH=$PWD:$PYTHONPATH
          srun -p ${SLURM_PARTITION} --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_core/test_pipeline.py
  unit_tests_utils_storage_manager:
    if: ${{ always() }}
    needs: check-requirements
    runs-on: [t_cluster]
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
      - uses: actions/checkout@v3

      - name: utils_storage_manager
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          export PYTHONPATH=$PWD:$PYTHONPATH
          srun -p ${SLURM_PARTITION} --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py
1 change: 1 addition & 0 deletions .github/workflows/weekly_test.yaml
@@ -3,6 +3,7 @@ on:
  push:
    branches:
      - "main"
      - "develop"
env:
  SLURM_PARTITION: llm_s

4 changes: 4 additions & 0 deletions .gitignore
@@ -149,5 +149,9 @@ memory_trace
13b_train*/
30b_train*/
fstp_logs/
configs/7B_train/*
configs/13B_train/*
configs/30B_train/*

atb
pip
4 changes: 2 additions & 2 deletions configs/13B_template.py
@@ -1,7 +1,7 @@
DO_ALERT = False

SEQ_LEN = 4096
JOB_NAME = "13b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({checkpoint})
SEQ_LEN = {seq_len}
JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
HIDDEN_SIZE = 5120
NUM_ATTENTION_HEAD = 40
MLP_RATIO = 8 / 3
4 changes: 2 additions & 2 deletions configs/30B_template.py
@@ -1,7 +1,7 @@
DO_ALERT = False

SEQ_LEN = 4096
JOB_NAME = "7b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({checkpoint})
SEQ_LEN = {seq_len}
JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
HIDDEN_SIZE = 6144
NUM_ATTENTION_HEAD = 48
MLP_RATIO = 8 / 3
22 changes: 15 additions & 7 deletions configs/7B_MoE4_sft.py
@@ -4,7 +4,7 @@
SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
MLP_RATIO = 4 / 3
NUM_LAYER = 32
VOCAB_SIZE = 103168

@@ -30,6 +30,14 @@
# 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
# 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported.
load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
# 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
# training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
# with an automatic restart mechanism upon training reboot.
# Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
# path specified in `load_ckpt_info` by default.
# If you want to initialize your model weights from another model, you must set `auto_resume` to False.
# If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
auto_resume=True,
checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload. (only work for boto3 ckpt)
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload.
@@ -43,7 +51,7 @@
# micro_num means the number of micro_batch contained in one gradient update
micro_num=4,
# packed_length = micro_bsz * SEQ_LEN
micro_bsz=1,
micro_bsz=2,
# defaults to the value of micro_num
valid_micro_num=4,
# defaults to 0, means disable evaluate
@@ -81,8 +89,8 @@

hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap_communication
overlap_sync_grad=True,
overlap_sync_param=True,
overlap_sync_grad=False,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
@@ -133,7 +141,7 @@
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
num_experts=4,
num_experts=8,
moe_use_residual=False,
moe_gate_k=2,
)
@@ -150,8 +158,8 @@
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
zero1=-1,
tensor=2,
zero1=dict(size=-1, fsdp=False),
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
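The new `auto_resume` comments above determine whether the path in `load_ckpt_info` is honored. As a rough illustration only (not part of this commit; the folder paths below are made-up placeholders, while `load_ckpt_info`, `content`, `ckpt_type` and `auto_resume` come from the diff), the two situations those comments describe would look like this in the same config style:

# Sketch of the two checkpoint-loading setups described by the new comments.
# 1) Resuming an interrupted run: the latest checkpoint under save_ckpt_folder is
#    picked up automatically and the path in load_ckpt_info is ignored.
ckpt_resume = dict(
    save_ckpt_folder="local:llm_ckpts/",  # placeholder path
    load_ckpt_info=dict(path="local:llm_ckpts/base/", content=("model",), ckpt_type="internlm"),
    auto_resume=True,
)

# 2) Initializing weights from another model (e.g. for fine-tuning): auto_resume must be
#    False so that the path given in load_ckpt_info is actually loaded.
ckpt_finetune = dict(
    save_ckpt_folder="local:llm_ckpts/",  # placeholder path
    load_ckpt_info=dict(path="local:pretrained_7b/", content=("model",), ckpt_type="internlm"),
    auto_resume=False,
)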
4 changes: 2 additions & 2 deletions configs/7B_template.py
@@ -1,8 +1,8 @@
# JOB_NAME = "7b_train"
DO_ALERT = False

SEQ_LEN = 4096
JOB_NAME = "7b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({checkpoint})
SEQ_LEN = {seq_len}
JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
19 changes: 14 additions & 5 deletions configs/generate.py
@@ -5,9 +5,9 @@
name = "./configs/"
root_names = ["7B_train_", "13B_train_", "30B_train_"]
model_size = ["7B", "13B", "30B"]
micro_bsz = [1, 2, 4, 8, 16, 32, 64]
sp = ["none", "megatron", "flash-attn", "intern"]
intern_overlap = [False, False, False, True]
seq_length = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
sp = ["none", "megatron", "flash-attn", "intern", "intern"]
intern_overlap = [False, False, False, True, False]
checkpoint = [False, True]

for idx, root_name in enumerate(root_names):
@@ -31,13 +31,22 @@
line = line.replace("{sp}", f'"{sp_mode}"')
line = line.replace("{intern_overlap}", str(intern_overlap[i]))
line = line.replace("{checkpoint}", str(ckpt))
output_file_name = str(mb) + "_" + str(sp_mode) + "_ckpt_" + str(ckpt) + ".py"
output_file_name = (
str(seq)
+ "_"
+ str(sp_mode)
+ "_overlap_"
+ str(intern_overlap[i])
+ "_ckpt_"
+ str(ckpt)
+ ".py"
)
write_file = folder_path + "/" + output_file_name
with open(write_file, "w") as file:
file.write(line)

log_name = root_name + "_" + output_file_name[:-3]

command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=10 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
process = subprocess.Popen(command, shell=True, executable="/bin/bash")
process.wait()
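For reference, a small self-contained sketch (not part of this commit) of what the placeholder substitution in configs/generate.py now produces for a single point of the sweep; the inline template string is a trimmed stand-in for the real configs/*_template.py files, and the chosen combination is one assumed sample from the lists above:

# Minimal sketch of the {seq_len}/{sp}/{intern_overlap}/{checkpoint} substitution shown above.
template = (
    'SEQ_LEN = {seq_len}\n'
    'JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})\n'
)

seq, sp_mode, overlap, ckpt = 4096, "intern", True, False  # one combination from the sweep lists

line = template
line = line.replace("{seq_len}", str(seq))
line = line.replace("{sp}", f'"{sp_mode}"')
line = line.replace("{intern_overlap}", str(overlap))
line = line.replace("{checkpoint}", str(ckpt))

output_file_name = str(seq) + "_" + str(sp_mode) + "_overlap_" + str(overlap) + "_ckpt_" + str(ckpt) + ".py"
print(output_file_name)  # 4096_intern_overlap_True_ckpt_False.py
print(line)  # substituted config source; executing it yields JOB_NAME == "7b_train_4096_intern_True_False"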
26 changes: 15 additions & 11 deletions doc/code-docs/locales/en/LC_MESSAGES/index.po
@@ -7,7 +7,7 @@ msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"POT-Creation-Date: 2023-10-10 17:48+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
@@ -16,7 +16,7 @@ msgstr ""
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
"Generated-By: Babel 2.13.0\n"

#: ../../source/index.rst:8 11e029810acf410180311a3c63eb01f4
msgid "InternLM"
@@ -46,38 +46,42 @@ msgstr "Parallel Training"
msgid "混合精度"
msgstr "Mixed Precision"

#: ../../source/index.rst:59 9234725f3c464731993d73607608c874
#: ../../source/index.rst:59
msgid "混合专家模型"
msgstr "Mixture-of-Experts"

#: ../../source/index.rst:67 9234725f3c464731993d73607608c874
msgid "模型备份"
msgstr "Model Checkpointing"

#: ../../source/index.rst:67 8e4ce037017f4510b2892a66003877fa
#: ../../source/index.rst:75 8e4ce037017f4510b2892a66003877fa
msgid "性能分析"
msgstr "Profiler"

#: ../../source/index.rst:75 a36e02819ecd4b448a8cb4ebbecb6600
#: ../../source/index.rst:83 a36e02819ecd4b448a8cb4ebbecb6600
msgid "训练监控"
msgstr "Monitor"

#: ../../source/index.rst:83 b912e292486f455c8b5cdd75962e8ac2
#: ../../source/index.rst:91 b912e292486f455c8b5cdd75962e8ac2
msgid "训练样例"
msgstr "Example"

#: ../../source/index.rst:91 ea9e9281720941a1830e5df7a2badf7a
#: ../../source/index.rst:99 ea9e9281720941a1830e5df7a2badf7a
msgid "常见问题"
msgstr "Q&A"

#: ../../source/index.rst:99 e08edc5aa1c74965b10084b393b88fae
#: ../../source/index.rst:107 e08edc5aa1c74965b10084b393b88fae
msgid "索引和表格"
msgstr "Indices and tables"

#: ../../source/index.rst:101 f3fdca059caa49dcad09aa44be7f02d6
#: ../../source/index.rst:109 f3fdca059caa49dcad09aa44be7f02d6
msgid ":ref:`genindex`"
msgstr ""

#: ../../source/index.rst:102 b3791e811315435097bb507edc3f4b9b
#: ../../source/index.rst:110 b3791e811315435097bb507edc3f4b9b
msgid ":ref:`modindex`"
msgstr ""

#: ../../source/index.rst:103 a164b772960f4ab8b18c7e8820f69f55
#: ../../source/index.rst:111 a164b772960f4ab8b18c7e8820f69f55
msgid ":ref:`search`"
msgstr ""