
Commit

Merge branch 'main' into main
binmakeswell authored Feb 19, 2024
2 parents 3032bad + 705a62a commit 4606b55
Showing 333 changed files with 6,945 additions and 10,035 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_on_pr.yml
@@ -140,7 +140,7 @@ jobs:
- name: Install Colossal-AI
run: |
-CUDA_EXT=1 pip install -v -e .
+BUILD_EXT=1 pip install -v -e .
pip install -r requirements/requirements-test.txt
- name: Store Colossal-AI Cache
4 changes: 2 additions & 2 deletions .github/workflows/build_on_schedule.yml
@@ -12,7 +12,7 @@ jobs:
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
-image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 90
steps:
@@ -55,7 +55,7 @@ jobs:
if: steps.check-avai.outputs.avai == 'true'
run: |
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
-CUDA_EXT=1 pip install -v -e .
+BUILD_EXT=1 pip install -v -e .
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
pip install -r requirements/requirements-test.txt
4 changes: 2 additions & 2 deletions .github/workflows/example_check_on_dispatch.yml
@@ -45,9 +45,9 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
container:
-image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
-timeout-minutes: 10
+timeout-minutes: 15
steps:
- name: 📚 Checkout
uses: actions/checkout@v3
2 changes: 1 addition & 1 deletion .github/workflows/example_check_on_pr.yml
@@ -77,7 +77,7 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
container:
-image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 20
concurrency:
2 changes: 1 addition & 1 deletion .github/workflows/example_check_on_schedule.yml
@@ -34,7 +34,7 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
-image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
+image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
timeout-minutes: 10
steps:
- name: 📚 Checkout
2 changes: 1 addition & 1 deletion .github/workflows/run_chatgpt_examples.yml
@@ -18,7 +18,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb
timeout-minutes: 30
defaults:
2 changes: 1 addition & 1 deletion .github/workflows/run_chatgpt_unit_tests.yml
@@ -20,7 +20,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm -v /data/scratch/chatgpt:/data/scratch/chatgpt
timeout-minutes: 30
defaults:
4 changes: 2 additions & 2 deletions .github/workflows/run_colossalqa_unit_tests.yml
@@ -19,7 +19,7 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
-image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
+image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
volumes:
- /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa
- /data/scratch/llama-tiny:/data/scratch/llama-tiny
@@ -51,4 +51,4 @@ jobs:
TEST_DATA_PATH_EN: /data/scratch/test_data_colossalqa/companies.txt
TEST_DATA_PATH_ZH: /data/scratch/test_data_colossalqa/companies_zh.txt
TEST_DOCUMENT_LOADER_DATA_PATH: /data/scratch/test_data_colossalqa/tests/*
-SQL_FILE_PATH: /data/scratch/test_data_colossalqa/sql_file_path
+SQL_FILE_PATH: /data/scratch/test_data_colossalqa/sql_file_path
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1,4 +1,4 @@
include *.txt README.md
recursive-include requirements *.txt
-recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
recursive-include op_builder *.py
+recursive-include extensions *.py *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
8 changes: 4 additions & 4 deletions README.md
@@ -398,10 +398,10 @@ pip install colossalai

**Note: only Linux is supported for now.**

-However, if you want to build the PyTorch extensions during installation, you can set `CUDA_EXT=1`.
+However, if you want to build the PyTorch extensions during installation, you can set `BUILD_EXT=1`.

```bash
-CUDA_EXT=1 pip install colossalai
+BUILD_EXT=1 pip install colossalai
```

**Otherwise, CUDA kernels will be built during runtime when you actually need them.**
@@ -429,7 +429,7 @@ By default, we do not compile CUDA/C++ kernels. ColossalAI will build them durin
If you want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):

```shell
-CUDA_EXT=1 pip install .
+BUILD_EXT=1 pip install .
```

For Users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory.
@@ -445,7 +445,7 @@ unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/

# install
-CUDA_EXT=1 pip install .
+BUILD_EXT=1 pip install .
```

<p align="right">(<a href="#top">back to top</a>)</p>
10 changes: 6 additions & 4 deletions applications/Chat/coati/dataset/sft_dataset.py
@@ -49,12 +49,13 @@ def _preprocess(
max_length: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Preprocess the data by tokenizing."""
-sequences = [s + t for s, t in zip(sources, targets)]
+sequences = [s + t + tokenizer.eos_token for s, t in zip(sources, targets)]
sequences_token = tokenizer(
-sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=False
)

sources_token = tokenizer(
-sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
+sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=False
)

assert sequences_token["attention_mask"].dim() == 2, "seq2seq model should be preprocessed differently"
@@ -65,7 +66,8 @@ def _preprocess(
if tokenizer.padding_side == "right":
# |prompt|completion|eos|pad|
labels[i][:source_len] = IGNORE_INDEX
-labels[i][-pad_len:] = IGNORE_INDEX
+if pad_len>0:
+    labels[i][-pad_len:] = IGNORE_INDEX
elif tokenizer.padding_side == "left":
# |pad|prompt|completion|eos|
labels[i][: pad_len + source_len] = IGNORE_INDEX
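For context on the hunk above: appending `tokenizer.eos_token` to each prompt+completion pair and tokenizing with `add_special_tokens=False` keeps exactly one EOS at the end of the sequence, and the new `pad_len > 0` guard avoids `labels[i][-0:]`, which would select the entire row and mask every label. A minimal, self-contained sketch of the resulting right-padding logic (the helper name and the tokenizer in the usage comment are illustrative, not part of the repository):

```python
import torch

IGNORE_INDEX = -100


def build_labels(source: str, target: str, tokenizer, max_length: int = 128) -> torch.Tensor:
    """Tokenize prompt + completion + EOS, then mask prompt and padding positions."""
    sequence = source + target + tokenizer.eos_token
    seq_tok = tokenizer(
        sequence, max_length=max_length, padding="max_length",
        truncation=True, return_tensors="pt", add_special_tokens=False,
    )
    src_tok = tokenizer(source, return_tensors="pt", add_special_tokens=False)

    labels = seq_tok["input_ids"][0].clone()
    source_len = src_tok["input_ids"].size(1)
    pad_len = int((seq_tok["attention_mask"][0] == 0).sum())

    labels[:source_len] = IGNORE_INDEX      # never train on the prompt tokens
    if pad_len > 0:                         # guard: labels[-0:] would wipe the whole row
        labels[-pad_len:] = IGNORE_INDEX    # ignore right-padding positions
    return labels


# Illustrative usage with any right-padding causal-LM tokenizer:
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained("gpt2")
# tokenizer.pad_token = tokenizer.eos_token
# labels = build_labels("Q: 2+2= ", "4", tokenizer)
```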
4 changes: 2 additions & 2 deletions applications/Chat/coati/trainer/ppo.py
@@ -10,7 +10,7 @@
from tqdm import tqdm
from transformers import PreTrainedTokenizerBase

-from colossalai.utils import get_current_device
+from colossalai.accelerator import get_accelerator

from .base import OnPolicyTrainer
from .callbacks import Callback
@@ -105,7 +105,7 @@ def __init__(
self.critic_optim = critic_optim

self.offload_inference_models = offload_inference_models
-self.device = get_current_device()
+self.device = get_accelerator().get_current_device()

def _before_fit(
self,
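As a quick reference for the API swap above, a hedged usage sketch of the accelerator interface that replaces `colossalai.utils.get_current_device` (assumes ColossalAI 0.3.4 or newer; the tensor allocation is purely illustrative):

```python
import torch

from colossalai.accelerator import get_accelerator

# get_accelerator() selects the backend for the current process;
# get_current_device() returns the torch.device this rank should allocate on.
device = get_accelerator().get_current_device()
x = torch.zeros(2, 2, device=device)  # illustrative allocation on the selected device
```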
13 changes: 11 additions & 2 deletions applications/Chat/coati/trainer/strategies/colossalai.py
@@ -6,7 +6,6 @@
import colossalai
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.utils import get_current_device
from colossalai.zero.gemini.gemini_ddp import GeminiDDP

from .ddp import DDPStrategy
@@ -158,9 +157,19 @@ def __init__(

warnings.warn(f"Stage 3 only supports fp16. Precision is set to fp16.")

+# colossalai has changed api for get_current_device in 0.3.4 version or newer
+try:
+    from colossalai.accelerator import get_accelerator
+
+    chunk_init_device = get_accelerator().get_current_device()
+except:
+    from colossalai.utils import get_current_device
+
+    chunk_init_device = get_current_device()
+
# NOTE: dist should be initialized before calling get_current_device()
plugin_initializer = lambda: GeminiPlugin(
-chunk_init_device=get_current_device(),
+chunk_init_device=chunk_init_device,
placement_policy=placement_policy,
shard_param_frac=shard_param_frac,
offload_optim_frac=offload_optim_frac,
2 changes: 1 addition & 1 deletion applications/Chat/examples/train_sft.sh
@@ -25,4 +25,4 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \
--accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
---max_epochs 1
+--max_epochs 1
68 changes: 10 additions & 58 deletions applications/Colossal-LLaMA-2/colossal_llama2/dataset/loader.py
@@ -1,20 +1,16 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

-import numpy as np
import os
-import random
from dataclasses import dataclass
-from typing import Dict, List, Union, Sequence, Optional, Iterator, Callable
+from typing import Dict, Iterator, List, Optional, Sequence, Union

import torch
-from datasets import dataset_dict, load_from_disk
+import torch.nn.functional as F
from datasets import Dataset as HFDataset
-from torch.distributed import ProcessGroup
-from torch.distributed.distributed_c10d import _get_default_group
-from torch.utils.data import ConcatDataset, Dataset, DataLoader, DistributedSampler
+from datasets import dataset_dict, load_from_disk
+from torch.utils.data import ConcatDataset, Dataset, DistributedSampler
from transformers.tokenization_utils import PreTrainedTokenizer
-import torch.nn.functional as F

DatasetType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
PathType = Union[str, os.PathLike]
@@ -62,6 +58,7 @@ class DataCollatorForSupervisedDataset(object):
tokenizer: PreTrainedTokenizer
max_length: int = 4096
ignore_index: int = -100
+padding: str = "max_length"

def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
"""
@@ -106,10 +103,11 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch
batch_first=True,
padding_value=self.ignore_index,
) # (bsz, max_len)
-# pad to max
-to_pad = self.max_length - input_ids.size(1)
-input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
-labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
+if self.padding == "max_length":
+    # pad to max
+    to_pad = self.max_length - input_ids.size(1)
+    input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
+    labels = F.pad(labels, (0, to_pad), value=self.ignore_index)
elif self.tokenizer.padding_side == "left":
reversed_input_ids = [seq.flip(dims=(0,)) for seq in batch_input_ids]
reversed_input_ids = torch.nn.utils.rnn.pad_sequence(
@@ -171,49 +169,3 @@ def __len__(self) -> int:

def set_start_index(self, start_index: int) -> None:
self.start_index = start_index


-def setup_distributed_dataloader(
-    dataset: DatasetType,
-    batch_size: int = 1,
-    shuffle: bool = False,
-    seed: int = 1024,
-    drop_last: bool = False,
-    pin_memory: bool = False,
-    num_workers: int = 0,
-    collate_fn: Callable[[Sequence[Dict[str, Union[str, List[int]]]]], Dict[str, torch.Tensor]] = None,
-    process_group: Optional[ProcessGroup] = None,
-    **kwargs,
-) -> DataLoader:
-    """
-    Setup dataloader for distributed training.
-    """
-    _kwargs = kwargs.copy()
-    process_group = process_group or _get_default_group()
-    sampler = StatefulDistributedSampler(
-        dataset=dataset,
-        num_replicas=process_group.size(),
-        rank=process_group.rank(),
-        shuffle=shuffle,
-        seed=seed,
-        drop_last=drop_last,
-    )
-
-    # Deterministic dataloader
-    def seed_worker(worker_id: int) -> None:
-        worker_seed = seed
-        np.random.seed(worker_seed)
-        torch.manual_seed(worker_seed)
-        random.seed(worker_seed)
-
-    return DataLoader(
-        dataset=dataset,
-        batch_size=batch_size,
-        sampler=sampler,
-        num_workers=num_workers,
-        collate_fn=collate_fn,
-        pin_memory=pin_memory,
-        drop_last=drop_last,
-        worker_init_fn=seed_worker,
-        **_kwargs,
-    )
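With `setup_distributed_dataloader` removed above, callers are expected to wire the sampler, collator, and `DataLoader` themselves. A hedged sketch of equivalent wiring, reusing only names visible in this file (`StatefulDistributedSampler`, `DataCollatorForSupervisedDataset`); the helper function, its defaults, the import path, and the `dataset`/`tokenizer` objects are assumptions for illustration:

```python
from torch.distributed.distributed_c10d import _get_default_group
from torch.utils.data import DataLoader

from colossal_llama2.dataset.loader import (
    DataCollatorForSupervisedDataset,
    StatefulDistributedSampler,
)


def build_sft_dataloader(dataset, tokenizer, batch_size: int = 1, seed: int = 1024) -> DataLoader:
    """Roughly reproduce the removed helper: distributed sampler + supervised collator."""
    pg = _get_default_group()  # assumes torch.distributed is already initialized
    sampler = StatefulDistributedSampler(
        dataset=dataset,
        num_replicas=pg.size(),
        rank=pg.rank(),
        shuffle=True,
        seed=seed,
    )
    collate_fn = DataCollatorForSupervisedDataset(
        tokenizer=tokenizer,
        max_length=4096,
        padding="max_length",  # field added in this commit
    )
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler, collate_fn=collate_fn)
```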
