feat(configs): resolve conflicts from merging feat/fstp_refactor
huangting4201 committed Oct 26, 2023
2 parents 6389984 + aa3840f commit e04a61a
Showing 8 changed files with 77 additions and 9 deletions.
configs/13B_template.py (2 additions, 2 deletions)
@@ -56,15 +56,15 @@
valid_micro_num=4,
# defaults to 0, means disable evaluate
valid_every=50,
- pack_sample_into_one=False,
+ pack_sample_into_one=True,
total_steps=20,
skip_batches="",
rampup_batch_size="",
# Datasets with less than 50 rows will be discarded
min_length=50,
# train_folder=TRAIN_FOLDER,
# valid_folder=VALID_FOLDER,
- empty_cache_and_diag_interval=10,
+ empty_cache_and_diag_interval=100,
diag_outlier_ratio=1.1,
)

configs/30B_template.py (3 additions, 3 deletions)
@@ -1,7 +1,7 @@
DO_ALERT = False

SEQ_LEN = 4096
- JOB_NAME = "7b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
+ JOB_NAME = "30b_train_" + str({micro_bsz}) + "_" + str({sp}) + "_" + str({intern_overlap}) + "_" + str({checkpoint})
HIDDEN_SIZE = 6144
NUM_ATTENTION_HEAD = 48
MLP_RATIO = 8 / 3
@@ -56,15 +56,15 @@
valid_micro_num=4,
# defaults to 0, means disable evaluate
valid_every=50,
- pack_sample_into_one=False,
+ pack_sample_into_one=True,
total_steps=20,
skip_batches="",
rampup_batch_size="",
# Datasets with less than 50 rows will be discarded
min_length=50,
# train_folder=TRAIN_FOLDER,
# valid_folder=VALID_FOLDER,
- empty_cache_and_diag_interval=10,
+ empty_cache_and_diag_interval=100,
diag_outlier_ratio=1.1,
)

configs/7B_sft.py (2 additions, 2 deletions)
@@ -57,7 +57,7 @@
# defaults to 0, means disable evaluate
valid_every=50,
pack_sample_into_one=True,
- total_steps=20,
+ total_steps=50,
skip_batches="",
rampup_batch_size="",
# Datasets with less than 50 rows will be discarded
@@ -163,7 +163,7 @@
"""
parallel = dict(
zero1=dict(size=-1, fsdp=False),
- tensor=dict(size=8, sp="none", intern_overlap=False),
+ tensor=dict(size=8, sp="intern", intern_overlap=True),
pipeline=dict(size=1, interleaved_overlap=True),
)

configs/generate.py (1 addition, 0 deletions)
@@ -47,6 +47,7 @@

log_name = root_name + "_" + output_file_name[:-3]

+ print(log_name)
command = f"srun -p llm_s -N 8 -n 64 --ntasks-per-node=8 --gpus-per-task=1 --time=30 python train.py --config {write_file} --profiling 2>&1 | tee ./fstp_logs/{log_name}.log"
process = subprocess.Popen(command, shell=True, executable="/bin/bash")
process.wait()
internlm/solver/optimizer/hybrid_zero_optim.py (2 additions, 0 deletions)
@@ -856,6 +856,8 @@ def broadcast_params(self):
for handle in handles:
handle.wait()

+ torch.cuda.synchronize()

##################
# FP16 Utilities #
##################
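A note on the hunk above: it adds a torch.cuda.synchronize() after every broadcast handle has been awaited. A minimal sketch of that pattern follows; it is illustrative only, with hypothetical names (broadcast_params_sketch, params, src_rank) standing in for the real hybrid_zero_optim.py logic, and it assumes the default process group is already initialized.

import torch
import torch.distributed as dist

def broadcast_params_sketch(params, src_rank=0):
    # Launch every broadcast as a non-blocking op so they can overlap.
    handles = [dist.broadcast(p.data, src=src_rank, async_op=True) for p in params]
    # Wait on each communication work handle.
    for handle in handles:
        handle.wait()
    # The line this commit adds: also block until the broadcast kernels
    # have finished on the GPU before the optimizer continues.
    torch.cuda.synchronize()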
internlm/train/training_internlm.py (13 additions, 0 deletions)
@@ -406,11 +406,13 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None):

tgs_list = []
tflops_list = []
+ tflops_list_2 = []


@llm_timeout(func_name="record_current_batch_training_metrics")
def record_current_batch_training_metrics(
get_tflops_func,
+ get_tflops_func_2,
logger,
writer,
success_update,
@@ -495,6 +497,7 @@ def record_current_batch_training_metrics(
tgs_SMA = round(tgs_statistic["SMA_tg_50"] / tgs_statistic["SMA_time_50"], 2)

tflops = get_tflops_func((time.time() - start_time))
+ tflops_2 = get_tflops_func_2((time.time() - start_time))

tgs_origin = round(
num_tokens_in_batch
@@ -506,6 +509,7 @@

infos = {
"tflops": tflops,
+ "tflops2": tflops_2,
"step": batch_count,
"loss": loss.item() - moe_loss.item() if moe_loss is not None else loss.item(),
"tgs (tokens/gpu/second)": tgs_origin,
@@ -599,16 +603,25 @@ def record_current_batch_training_metrics(
if batch_count >= 5:
tgs_list.append(tgs_origin)
tflops_list.append(tflops)
+ tflops_list_2.append(tflops_2)
if batch_count == gpc.config.data.total_steps - 1:
print(tgs_list, flush=True)
avg_tgs = sum(tgs_list) / len(tgs_list)
for tgs in tgs_list.copy():
if abs(tgs - avg_tgs) > 400:
tgs_list.remove(tgs)
print(f"avg_tgs: {sum(tgs_list)/len(tgs_list)}", flush=True)

print(tflops_list, flush=True)
avg_tflops = sum(tflops_list) / len(tflops_list)
for tf in tflops_list.copy():
if abs(tf - avg_tflops) > 10:
tflops_list.remove(tf)
print(f"avg_tflops: {sum(tflops_list)/len(tflops_list)}", flush=True)

+ print(tflops_list_2, flush=True)
+ avg_tflops_2 = sum(tflops_list_2) / len(tflops_list_2)
+ for tf in tflops_list_2.copy():
+ if abs(tf - avg_tflops_2) > 10:
+ tflops_list_2.remove(tf)
+ print(f"avg_tflops_2: {sum(tflops_list_2)/len(tflops_list_2)}", flush=True)
internlm/utils/common.py (37 additions, 0 deletions)
@@ -220,6 +220,43 @@ def get_megatron_flops(
return tflops


+ def get_megatron_flops_2(
+ elapsed_time_per_iter,
+ checkpoint=False,
+ seq_len=2048,
+ hidden_size=12,
+ num_layers=32,
+ vocab_size=12,
+ global_batch_size=4,
+ global_world_size=1,
+ mlp_ratio=4,
+ use_swiglu=True,
+ ):
+ """
+ Calc flops based on the paper of Megatron https://deepakn94.github.io/assets/papers/megatron-sc21.pdf
+ """
+
+ checkpoint_activations_factor = 4 if checkpoint else 3
+ flashattn_activations_factor = 4.5 if checkpoint else 3.5
+
+ if use_swiglu:
+ mlp_ratio = mlp_ratio * 3 / 2
+
+ flops_per_iteration = (
+ checkpoint_activations_factor
+ * (8 + mlp_ratio * 4)
+ * global_batch_size
+ * seq_len
+ * hidden_size**2
+ * num_layers
+ + 4 * global_batch_size * seq_len**2 * hidden_size * num_layers * flashattn_activations_factor
+ + 6 * global_batch_size * seq_len * hidden_size * vocab_size
+ )
+
+ tflops = flops_per_iteration / (elapsed_time_per_iter * global_world_size * (10**12))
+ return tflops


class DummyProfile:
"""
Dummy Profile.
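A note on the function above: get_megatron_flops_2 estimates achieved TFLOPS per GPU from the iteration time and the model/run dimensions. A minimal usage sketch follows; every argument value is a placeholder chosen for illustration, not a value taken from any config in this commit.

from internlm.utils.common import get_megatron_flops_2

# Hypothetical model and run dimensions, for illustration only.
tflops = get_megatron_flops_2(
    12.0,                   # elapsed_time_per_iter: seconds for one iteration
    checkpoint=False,       # no activation checkpointing
    seq_len=4096,
    hidden_size=6144,
    num_layers=40,
    vocab_size=103168,
    global_batch_size=512,  # sequences per global step
    global_world_size=64,   # total number of GPUs
    mlp_ratio=8 / 3,
    use_swiglu=True,
)
print(f"estimated TFLOPS per GPU: {tflops:.2f}")

In train.py this helper is bound with functools.partial to values from gpc.config, and its result is logged next to the existing metric under the key tflops2.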
train.py (17 additions, 2 deletions)
@@ -33,6 +33,7 @@
from internlm.utils.common import (
BatchSkipper,
get_megatron_flops,
+ get_megatron_flops_2,
launch_time,
parse_args,
)
@@ -111,6 +112,18 @@ def main(args):
global_world_size=gpc.get_world_size(ParallelMode.GLOBAL),
mlp_ratio=gpc.config.MLP_RATIO,
)

+ get_tflops_func_2 = partial(
+ get_megatron_flops_2,
+ checkpoint=gpc.config.model.checkpoint,
+ seq_len=gpc.config.SEQ_LEN,
+ hidden_size=gpc.config.model.hidden_size,
+ num_layers=gpc.config.model.num_layers,
+ vocab_size=gpc.config.model.vocab_size,
+ global_batch_size=gpc.config.data.micro_bsz * gpc.config.data.micro_num * gpc.get_world_size(ParallelMode.DATA),
+ global_world_size=gpc.get_world_size(ParallelMode.GLOBAL),
+ mlp_ratio=gpc.config.MLP_RATIO,
+ )

# get and broadcast current time
current_time = launch_time()
@@ -271,6 +284,7 @@ def main(args):
# calculate and record the training metrics, eg. loss, accuracy and so on.
record_current_batch_training_metrics(
get_tflops_func=get_tflops_func,
+ get_tflops_func_2=get_tflops_func_2,
logger=logger,
writer=writer,
success_update=success_update,
@@ -309,8 +323,9 @@ def main(args):

if memory_profiler is not None:
memory_profiler.step()

- prof.step()

+ if batch_count % 2 == 0:
+ prof.step()

if gpc.fstp_handler is not None:
gpc.fstp_handler.clear_memory_pool()
