Commit c565519: merge commit

FrankLeeeee committed Jan 31, 2024
2 parents 5f98a9d + 71321a0

Showing 306 changed files with 4,052 additions and 9,119 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/build_on_pr.yml
@@ -140,7 +140,7 @@ jobs:
       - name: Install Colossal-AI
         run: |
-          CUDA_EXT=1 pip install -v -e .
+          BUILD_EXT=1 pip install -v -e .
           pip install -r requirements/requirements-test.txt
- name: Store Colossal-AI Cache
@@ -160,9 +160,7 @@ jobs:
--ignore tests/test_gptq \
--ignore tests/test_infer_ops \
--ignore tests/test_legacy \
--ignore tests/test_moe \
--ignore tests/test_smoothquant \
--ignore tests/test_checkpoint_io \
tests/
env:
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
5 changes: 3 additions & 2 deletions .github/workflows/build_on_schedule.yml
@@ -12,7 +12,7 @@ jobs:
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
       options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 90
steps:
@@ -23,6 +23,7 @@ jobs:
ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
endIndex=$(($ngpu-1))
for i in $(seq 0 $endIndex);
do
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -gt "2000" ] && avai=false
done
@@ -54,7 +55,7 @@ jobs:
if: steps.check-avai.outputs.avai == 'true'
run: |
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
-          CUDA_EXT=1 pip install -v -e .
+          BUILD_EXT=1 pip install -v -e .
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
pip install -r requirements/requirements-test.txt
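
The scheduled build only proceeds when every GPU on the self-hosted runner is idle; the shell loop in the hunk above flags a card as busy once more than roughly 2000 MiB of its memory is in use. The same check expressed in Python, as a sketch (the threshold is taken from the workflow; a CUDA build of PyTorch is assumed):

```python
import torch


def all_gpus_idle(threshold_mib: int = 2000) -> bool:
    """Return True if no visible GPU uses more memory than the threshold."""
    for i in range(torch.cuda.device_count()):
        free, total = torch.cuda.mem_get_info(i)  # both in bytes
        used_mib = (total - free) / 1024**2
        if used_mib > threshold_mib:
            return False  # card is considered busy
    return True


print(all_gpus_idle())
```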
4 changes: 2 additions & 2 deletions .github/workflows/example_check_on_dispatch.yml
@@ -45,9 +45,9 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
     container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
       options: --gpus all --rm -v /data/scratch/examples-data:/data/
-    timeout-minutes: 10
+    timeout-minutes: 15
steps:
- name: 📚 Checkout
uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/example_check_on_pr.yml
@@ -77,7 +77,7 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 20
concurrency:
2 changes: 1 addition & 1 deletion .github/workflows/example_check_on_schedule.yml
@@ -34,7 +34,7 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
-      image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
timeout-minutes: 10
steps:
- name: 📚 Checkout
2 changes: 1 addition & 1 deletion .github/workflows/run_chatgpt_examples.yml
@@ -18,7 +18,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb
timeout-minutes: 30
defaults:
2 changes: 1 addition & 1 deletion .github/workflows/run_chatgpt_unit_tests.yml
@@ -20,7 +20,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/chatgpt:/data/scratch/chatgpt
timeout-minutes: 30
defaults:
4 changes: 2 additions & 2 deletions .github/workflows/run_colossalqa_unit_tests.yml
@@ -19,7 +19,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
-      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
volumes:
- /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa
- /data/scratch/llama-tiny:/data/scratch/llama-tiny
@@ -51,4 +51,4 @@ jobs:
TEST_DATA_PATH_EN: /data/scratch/test_data_colossalqa/companies.txt
TEST_DATA_PATH_ZH: /data/scratch/test_data_colossalqa/companies_zh.txt
TEST_DOCUMENT_LOADER_DATA_PATH: /data/scratch/test_data_colossalqa/tests/*
-      SQL_FILE_PATH: /data/scratch/test_data_colossalqa/sql_file_path
+      SQL_FILE_PATH: /data/scratch/test_data_colossalqa/sql_file_path
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1,4 +1,4 @@
 include *.txt README.md
 recursive-include requirements *.txt
 recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
-recursive-include op_builder *.py
+recursive-include extensions *.py *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
39 changes: 20 additions & 19 deletions README.md
@@ -141,25 +141,26 @@ distributed training and inference in a few lines.
[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-13b-base)
[[Modelscope model weights]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-13b-base/summary)

| Model | Backbone | Tokens Consumed | MMLU (5-shot) | CMMLU (5-shot)| AGIEval (5-shot) | GAOKAO (0-shot) | CEval (5-shot) |
| :-----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :--------------: | :-------------: | :-------------: |
| Baichuan-7B | - | 1.2T | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
| Baichuan-13B-Base | - | 1.4T | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
| Baichuan2-7B-Base | - | 2.6T | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
| Baichuan2-13B-Base | - | 2.6T | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
| ChatGLM-6B | - | 1.0T | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
| ChatGLM2-6B | - | 1.4T | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
| InternLM-7B | - | 1.6T | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
| Qwen-7B | - | 2.2T | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
| Llama-2-7B | - | 2.0T | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | 37.43 | 29.92 | 32.00 | 27.57 | - |
| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | 38.56 | 31.52 | 30.99 | 25.95 | - |
| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | 43.73 | 42.04 | 37.64 | 30.61 | - |
| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | 48.41 | 38.31 | 38.45 | 27.72 | - |
| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | 49.96 | 41.10 | 39.83 | 33.00 | - |
| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | 50.25 | 40.99 | 40.04 | 30.54 | - |
| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 |
| **Colossal-LLaMA-2-13b-base** | Llama-2-13B | **0.025T** | 56.42 | 61.80 | 54.69 | 69.53 | 60.3 |


### ColossalChat
4 changes: 2 additions & 2 deletions applications/Chat/coati/trainer/ppo.py
@@ -10,7 +10,7 @@
from tqdm import tqdm
from transformers import PreTrainedTokenizerBase

-from colossalai.utils import get_current_device
+from colossalai.accelerator import get_accelerator

from .base import OnPolicyTrainer
from .callbacks import Callback
@@ -105,7 +105,7 @@ def __init__(
self.critic_optim = critic_optim

self.offload_inference_models = offload_inference_models
-        self.device = get_current_device()
+        self.device = get_accelerator().get_current_device()

def _before_fit(
self,
13 changes: 11 additions & 2 deletions applications/Chat/coati/trainer/strategies/colossalai.py
@@ -6,7 +6,6 @@
import colossalai
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.utils import get_current_device
from colossalai.zero.gemini.gemini_ddp import GeminiDDP

from .ddp import DDPStrategy
@@ -158,9 +157,19 @@ def __init__(

warnings.warn(f"Stage 3 only supports fp16. Precision is set to fp16.")

+        # ColossalAI changed the get_current_device API in version 0.3.4 and newer
+        try:
+            from colossalai.accelerator import get_accelerator
+
+            chunk_init_device = get_accelerator().get_current_device()
+        except ImportError:
+            from colossalai.utils import get_current_device
+
+            chunk_init_device = get_current_device()
+
         # NOTE: dist should be initialized before calling get_current_device()
         plugin_initializer = lambda: GeminiPlugin(
-            chunk_init_device=get_current_device(),
+            chunk_init_device=chunk_init_device,
placement_policy=placement_policy,
shard_param_frac=shard_param_frac,
offload_optim_frac=offload_optim_frac,
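
The version-compatibility shim above recurs in several files touched by this commit. A consolidated helper is sketched below; the function name `get_current_device_compat` is ours for illustration, not part of either API:

```python
def get_current_device_compat():
    """Return the current device, preferring the accelerator API introduced
    in colossalai 0.3.4 and falling back to the older utility."""
    try:
        from colossalai.accelerator import get_accelerator

        return get_accelerator().get_current_device()
    except ImportError:
        from colossalai.utils import get_current_device

        return get_current_device()
```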
@@ -6,12 +6,12 @@
"""

 import argparse
-import os
 import json
+import os
 from typing import List, Union

-from transformers.models.llama.tokenization_llama import LlamaTokenizer
 from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
+from transformers.models.llama.tokenization_llama import LlamaTokenizer

from colossalai.logging import get_dist_logger

@@ -16,7 +16,10 @@


 def unwrap(model):
-    return model.unwrap().module
+    if hasattr(model, "module"):
+        return unwrap_model(model.module)
+    else:
+        return model


def neftune_post_forward_hook(module, input, output):
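
The new `unwrap` recurses through `.module` attributes so that wrappers such as DDP are peeled off before patching; `unwrap_model` is presumably the `transformers` utility imported earlier in this file. A self-contained sketch of the same idea, using direct recursion in place of `unwrap_model`:

```python
import torch


class Wrapper(torch.nn.Module):
    """Stand-in for DDP-style wrappers that expose the inner model as `.module`."""

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module


def unwrap(model):
    # Recurse through `.module` wrappers until the bare model remains.
    if hasattr(model, "module"):
        return unwrap(model.module)
    return model


base = torch.nn.Linear(4, 4)
assert unwrap(Wrapper(Wrapper(base))) is base
```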
56 changes: 28 additions & 28 deletions applications/Colossal-LLaMA-2/train.py
@@ -1,44 +1,37 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-Continual Pre-training of LLaMA-2 developed by Colossal-AI Team
+Continual Pre-training of LLaMA-2 developed by Colossal-AI Team
 """

-import json
 import argparse
+import json
 import os
 import resource
 from contextlib import nullcontext
-from tqdm import tqdm

 import torch
 import torch.distributed as dist
+from colossal_llama2.dataset.loader import (
+    DataCollatorForSupervisedDataset,
+    StatefulDistributedSampler,
+    load_tokenized_dataset,
+    setup_distributed_dataloader,
+)
+from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
+from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
+from colossal_llama2.utils.froze import freeze_non_embeds_parameters
 from torch.utils.tensorboard import SummaryWriter
-from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaConfig
+from tqdm import tqdm
+from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer

 import colossalai
 from colossalai.booster import Booster
-from colossalai.booster.plugin import (
-    GeminiPlugin,
-    LowLevelZeroPlugin,
-    HybridParallelPlugin,
-)
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.utils import get_current_device
-
-from colossal_llama2.dataset.loader import (
-    load_tokenized_dataset,
-    setup_distributed_dataloader,
-    DataCollatorForSupervisedDataset,
-    StatefulDistributedSampler,
-)
-
-from colossal_llama2.utils.flash_attention_patch import replace_with_flash_attention
-from colossal_llama2.utils.ckpt_io import load_checkpoint, save_checkpoint
-from colossal_llama2.utils.froze import freeze_non_embeds_parameters

def get_model_numel(model: torch.nn.Module) -> int:
@@ -215,9 +208,18 @@ def main() -> None:
# ======================================================
# Initialize Model, Objective, Optimizer and LR Scheduler
# ======================================================
-    init_ctx = (
-        LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
-    )
+
+    # ColossalAI changed the get_current_device API in version 0.3.4 and newer
+    try:
+        from colossalai.accelerator import get_accelerator
+
+        current_device = get_accelerator().get_current_device()
+    except ImportError:
+        from colossalai.utils import get_current_device
+
+        current_device = get_current_device()
+
+    init_ctx = LazyInitContext(default_device=current_device) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
with init_ctx:
model = LlamaForCausalLM(LlamaConfig.from_pretrained(args.pretrained))
# Freeze part of parameters.
@@ -320,7 +322,7 @@ def main() -> None:
initial=start_step,
) as pbar:
for step, batch in pbar:
-            batch = {k: v.to(get_current_device()) for k, v in batch.items() if isinstance(v, torch.Tensor)}
+            batch = {k: v.to(current_device) for k, v in batch.items() if isinstance(v, torch.Tensor)}

batch_output = model(**batch)

@@ -372,9 +374,7 @@ def main() -> None:
# Final save.
coordinator.print_on_master("Start saving final model checkpoint")
booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
-    coordinator.print_on_master(
-        f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}"
-    )
+    coordinator.print_on_master(f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}")

coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")

13 changes: 13 additions & 0 deletions applications/ColossalQA/colossalqa/local/llm.py
@@ -136,6 +136,19 @@ def _identifying_params(self) -> Mapping[str, int]:
"""Get the identifying parameters."""
return {"n": self.n}

+    def get_token_ids(self, text: str) -> List[int]:
+        """Return the ordered ids of the tokens in a text.
+        Args:
+            text: The string input to tokenize.
+        Returns:
+            A list of ids corresponding to the tokens in the text, in the order
+            they occur in the text.
+        """
+        # use the colossal llm's tokenizer instead of langchain's cached GPT2 tokenizer
+        return self.api.tokenizer.encode(text)


class VllmLLM(LLM):
"""
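
Overriding `get_token_ids` matters because langchain's default implementation falls back to a cached GPT-2 tokenizer, so token counts can disagree with the serving model. A usage sketch (the constructor arguments are illustrative; the real signatures live in `colossalqa.local.llm`):

```python
# Hypothetical wiring: `api` wraps the local model and its tokenizer.
llm = ColossalLLM(n=1, api=api)

token_ids = llm.get_token_ids("Hello, world")
# The length now reflects the model's own tokenizer rather than
# langchain's cached GPT-2 approximation.
print(len(token_ids))
```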
1 change: 1 addition & 0 deletions colossalai/__init__.py
@@ -1,4 +1,5 @@
 from .initialize import launch, launch_from_openmpi, launch_from_slurm, launch_from_torch
+from . import accelerator

try:
# .version will be created by setup.py
20 changes: 20 additions & 0 deletions colossalai/accelerator/README.md
@@ -0,0 +1,20 @@
# 🚀 Accelerator

## 🔗 Table of Contents

- [🚀 Accelerator](#-accelerator)
- [🔗 Table of Contents](#-table-of-contents)
- [📚 Introduction](#-introduction)
- [📌 Design and Acknowledgement](#-design-and-acknowledgement)

## 📚 Introduction

This module offers a layer of abstraction for ColossalAI. With this module, the user can easily switch between different accelerator backends, such as Nvidia GPUs and Huawei NPUs. This module is an attempt to make users' code portable across different hardware platforms with a simple `auto_set_accelerator()` API.

## 📌 Design and Acknowledgement

Our `accelerator` module is heavily inspired by [`deepspeed/accelerator`](https://www.deepspeed.ai/tutorials/accelerator-abstraction-interface/). We found that it is a very well-designed and well-structured module that can be easily integrated into our project. We would like to thank the DeepSpeed team for their great work.

We implemented this accelerator module from scratch, with our own modifications:
1. We aligned the accelerator API names with PyTorch's native API names.
2. We did not include the `op builder` in the `accelerator`. Instead, we restructured our `kernel` module to automatically match the accelerator to its corresponding kernel implementations, keeping the modules less tangled.
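
A short sketch of the abstraction in use; the API names below are taken from the diffs in this commit, while behaviour on non-CUDA backends is an assumption:

```python
import torch

from colossalai.accelerator import get_accelerator

accelerator = get_accelerator()            # backend for the current platform, e.g. CUDA
device = accelerator.get_current_device()  # analogous to torch.cuda.current_device()
x = torch.randn(2, 2).to(device)           # tensors move to the device as usual
```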