fix(conflicts): resolve conflicts from merging feat/fstp-refactor
huangting4201 committed Oct 25, 2023
2 parents 7250453 + cc20fa2 commit ab97724
Showing 39 changed files with 1,438 additions and 128 deletions.
76 changes: 76 additions & 0 deletions .github/workflows/unit_tests.yaml
@@ -0,0 +1,76 @@
name: unit-tests
on:
  push:
    branches:
      - "develop"
      - "main"
    paths-ignore:
      - "cmds/**"
      - "**.md"
  pull_request:
    branches:
      - "develop"
      - "main"
    paths-ignore:
      - "cmds/**"
      - "**.md"
env:
  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
  SLURM_PARTITION: llm_t

jobs:
  check-requirements:
    runs-on: [t_cluster]
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      - name: check-requirements
        run: |
          changed_files=$(git diff --name-only -r HEAD^1 HEAD)
          echo $changed_files
          if [[ $changed_files =~ "runtime.txt" ]]; then
            pip install -r requirements/runtime.txt
          fi
          if [[ $changed_files =~ "torch.txt" ]]; then
            pip install -r requirements/torch.txt
          fi
  unit_tests_core_pipeline:
    if: ${{ always() }}
    needs: check-requirements
    runs-on: [t_cluster]
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
      - uses: actions/checkout@v3

      - name: core_pipeline
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          export PYTHONPATH=$PWD:$PYTHONPATH
          srun -p ${SLURM_PARTITION} --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_core/test_pipeline.py
  unit_tests_utils_storage_manager:
    if: ${{ always() }}
    needs: check-requirements
    runs-on: [t_cluster]
    timeout-minutes: 20
    steps:
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
      - uses: actions/checkout@v3

      - name: utils_storage_manager
        run: |
          source /mnt/petrelfs/share_data/llm_env/env/llm-flash2.0
          export PYTHONPATH=$PWD:$PYTHONPATH
          srun -p ${SLURM_PARTITION} --job-name=internlm-ut-${GITHUB_RUN_ID}-${GITHUB_JOB} --quotatype=spot -N 1 -n 1 --gres=gpu:8 python -m pytest -s -v ./tests/test_utils/test_storage_manager.py
1 change: 1 addition & 0 deletions .github/workflows/weekly_test.yaml
@@ -3,6 +3,7 @@ on:
  push:
    branches:
      - "main"
      - "develop"
env:
  SLURM_PARTITION: llm_s

4 changes: 4 additions & 0 deletions .gitignore
@@ -149,5 +149,9 @@ memory_trace
13b_train*/
30b_train*/
fstp_logs/
configs/7B_train/*
configs/13B_train/*
configs/30B_train/*

atb
pip
4 changes: 2 additions & 2 deletions configs/13B_template.py
@@ -1,7 +1,7 @@
DO_ALERT = False

SEQ_LEN = 4096
JOB_NAME = "13b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({checkpoint})
SEQ_LEN = {seq_len}
JOB_NAME = "13b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
HIDDEN_SIZE = 5120
NUM_ATTENTION_HEAD = 40
MLP_RATIO = 8 / 3
4 changes: 2 additions & 2 deletions configs/30B_template.py
@@ -1,7 +1,7 @@
DO_ALERT = False

SEQ_LEN = 4096
JOB_NAME = "7b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({checkpoint})
SEQ_LEN = {seq_len}
JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
HIDDEN_SIZE = 6144
NUM_ATTENTION_HEAD = 48
MLP_RATIO = 8 / 3
22 changes: 15 additions & 7 deletions configs/7B_MoE4_sft.py
@@ -4,7 +4,7 @@
SEQ_LEN = 2048
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
MLP_RATIO = 4 / 3
NUM_LAYER = 32
VOCAB_SIZE = 103168

@@ -30,6 +30,14 @@
# 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
# 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, now only 'normal' type is supported.
load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
# 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering
# training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm)
# with an automatic restart mechanism upon training reboot.
# Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint
# path specified in `load_ckpt_info` by default.
# If you want to initialize your model weights from another model, you must set `auto_resume` to False.
# If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None.
auto_resume=True,
checkpoint_every=CHECKPOINT_EVERY,
async_upload=True, # async ckpt upload. (only work for boto3 ckpt)
async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload.
@@ -43,7 +51,7 @@
# micro_num means the number of micro_batch contained in one gradient update
micro_num=4,
# packed_length = micro_bsz * SEQ_LEN
micro_bsz=1,
micro_bsz=2,
# defaults to the value of micro_num
valid_micro_num=4,
# defaults to 0, means disable evaluate
@@ -81,8 +89,8 @@

hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap_communication
overlap_sync_grad=True,
overlap_sync_param=True,
overlap_sync_grad=False,
overlap_sync_param=False,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
@@ -133,7 +141,7 @@
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
num_experts=4,
num_experts=8,
moe_use_residual=False,
moe_gate_k=2,
)
@@ -150,8 +158,8 @@
tensor parallel: tensor parallel size, usually the number of GPUs per node.
"""
parallel = dict(
zero1=-1,
tensor=2,
zero1=dict(size=-1, fsdp=False),
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
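The new `auto_resume` comments above determine whether the path in `load_ckpt_info` is honored. As a rough illustration only (not part of this commit; the folder paths below are made-up placeholders, while `load_ckpt_info`, `content`, `ckpt_type` and `auto_resume` come from the diff), the two situations those comments describe would look like this in the same config style:

# Sketch of the two checkpoint-loading setups described by the new comments.
# 1) Resuming an interrupted run: the latest checkpoint under save_ckpt_folder is
#    picked up automatically and the path in load_ckpt_info is ignored.
ckpt_resume = dict(
    save_ckpt_folder="local:llm_ckpts/",  # placeholder path
    load_ckpt_info=dict(path="local:llm_ckpts/base/", content=("model",), ckpt_type="internlm"),
    auto_resume=True,
)

# 2) Initializing weights from another model (e.g. for fine-tuning): auto_resume must be
#    False so that the path given in load_ckpt_info is actually loaded.
ckpt_finetune = dict(
    save_ckpt_folder="local:llm_ckpts/",  # placeholder path
    load_ckpt_info=dict(path="local:pretrained_7b/", content=("model",), ckpt_type="internlm"),
    auto_resume=False,
)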
4 changes: 2 additions & 2 deletions configs/7B_template.py
@@ -1,8 +1,8 @@
# JOB_NAME = "7b_train"
DO_ALERT = False

SEQ_LEN = 4096
JOB_NAME = "7b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({checkpoint})
SEQ_LEN = {seq_len}
JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
HIDDEN_SIZE = 4096
NUM_ATTENTION_HEAD = 32
MLP_RATIO = 8 / 3
19 changes: 14 additions & 5 deletions configs/generate.py
@@ -5,9 +5,9 @@
name = "./configs/"
root_names = ["7B_train_", "13B_train_", "30B_train_"]
model_size = ["7B", "13B", "30B"]
micro_bsz = [1, 2, 4, 8, 16, 32, 64]
sp = ["none", "megatron", "flash-attn", "intern"]
intern_overlap = [False, False, False, True]
seq_length = [4096, 8192, 16384, 32768, 65536, 131072, 262144]
sp = ["none", "megatron", "flash-attn", "intern", "intern"]
intern_overlap = [False, False, False, True, False]
checkpoint = [False, True]

for idx, root_name in enumerate(root_names):
@@ -31,13 +31,22 @@
line = line.replace("{sp}", f'"{sp_mode}"')
line = line.replace("{intern_overlap}", str(intern_overlap[i]))
line = line.replace("{checkpoint}", str(ckpt))
output_file_name = str(mb) + "_" + str(sp_mode) + "_ckpt_" + str(ckpt) + ".py"
output_file_name = (
str(seq)
+ "_"
+ str(sp_mode)
+ "_overlap_"
+ str(intern_overlap[i])
+ "_ckpt_"
+ str(ckpt)
+ ".py"
)
write_file = folder_path + "/" + output_file_name
with open(write_file, "w") as file:
file.write(line)

log_name = root_name + "_" + output_file_name[:-3]

command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=10 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
process = subprocess.Popen(command, shell=True, executable="/bin/bash")
process.wait()
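For reference, a small self-contained sketch (not part of this commit) of what the placeholder substitution in configs/generate.py now produces for a single point of the sweep; the inline template string is a trimmed stand-in for the real configs/*_template.py files, and the chosen combination is one assumed sample from the lists above:

# Minimal sketch of the {seq_len}/{sp}/{intern_overlap}/{checkpoint} substitution shown above.
template = (
    'SEQ_LEN = {seq_len}\n'
    'JOB_NAME = "7b_train_" + str(SEQ_LEN) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})\n'
)

seq, sp_mode, overlap, ckpt = 4096, "intern", True, False  # one combination from the sweep lists

line = template
line = line.replace("{seq_len}", str(seq))
line = line.replace("{sp}", f'"{sp_mode}"')
line = line.replace("{intern_overlap}", str(overlap))
line = line.replace("{checkpoint}", str(ckpt))

output_file_name = str(seq) + "_" + str(sp_mode) + "_overlap_" + str(overlap) + "_ckpt_" + str(ckpt) + ".py"
print(output_file_name)  # 4096_intern_overlap_True_ckpt_False.py
print(line)  # substituted config source; executing it yields JOB_NAME == "7b_train_4096_intern_True_False"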
26 changes: 15 additions & 11 deletions doc/code-docs/locales/en/LC_MESSAGES/index.po
@@ -7,7 +7,7 @@ msgid ""
msgstr ""
"Project-Id-Version: InternLM \n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2023-09-07 10:56+0800\n"
"POT-Creation-Date: 2023-10-10 17:48+0800\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language: en\n"
@@ -16,7 +16,7 @@ msgstr ""
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=utf-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Generated-By: Babel 2.12.1\n"
"Generated-By: Babel 2.13.0\n"

#: ../../source/index.rst:8 11e029810acf410180311a3c63eb01f4
msgid "InternLM"
@@ -46,38 +46,42 @@ msgstr "Parallel Training"
msgid "混合精度"
msgstr "Mixed Precision"

#: ../../source/index.rst:59 9234725f3c464731993d73607608c874
#: ../../source/index.rst:59
msgid "混合专家模型"
msgstr "Mixture-of-Experts"

#: ../../source/index.rst:67 9234725f3c464731993d73607608c874
msgid "模型备份"
msgstr "Model Checkpointing"

#: ../../source/index.rst:67 8e4ce037017f4510b2892a66003877fa
#: ../../source/index.rst:75 8e4ce037017f4510b2892a66003877fa
msgid "性能分析"
msgstr "Profiler"

#: ../../source/index.rst:75 a36e02819ecd4b448a8cb4ebbecb6600
#: ../../source/index.rst:83 a36e02819ecd4b448a8cb4ebbecb6600
msgid "训练监控"
msgstr "Monitor"

#: ../../source/index.rst:83 b912e292486f455c8b5cdd75962e8ac2
#: ../../source/index.rst:91 b912e292486f455c8b5cdd75962e8ac2
msgid "训练样例"
msgstr "Example"

#: ../../source/index.rst:91 ea9e9281720941a1830e5df7a2badf7a
#: ../../source/index.rst:99 ea9e9281720941a1830e5df7a2badf7a
msgid "常见问题"
msgstr "Q&A"

#: ../../source/index.rst:99 e08edc5aa1c74965b10084b393b88fae
#: ../../source/index.rst:107 e08edc5aa1c74965b10084b393b88fae
msgid "索引和表格"
msgstr "Indices and tables"

#: ../../source/index.rst:101 f3fdca059caa49dcad09aa44be7f02d6
#: ../../source/index.rst:109 f3fdca059caa49dcad09aa44be7f02d6
msgid ":ref:`genindex`"
msgstr ""

#: ../../source/index.rst:102 b3791e811315435097bb507edc3f4b9b
#: ../../source/index.rst:110 b3791e811315435097bb507edc3f4b9b
msgid ":ref:`modindex`"
msgstr ""

#: ../../source/index.rst:103 a164b772960f4ab8b18c7e8820f69f55
#: ../../source/index.rst:111 a164b772960f4ab8b18c7e8820f69f55
msgid ":ref:`search`"
msgstr ""