Skip to content

Commit

Permalink
Merge branch 'master' into gma/add_autotp_workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
loadams authored Feb 5, 2024
2 parents 33caddd + 3e6d606 commit 552aa5b
Show file tree
Hide file tree
Showing 60 changed files with 1,181 additions and 447 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/amd-mi200.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: amd-mi200

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cpu-inference.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: cpu-inference

on:
workflow_dispatch:
pull_request:
paths:
- '.github/workflows/cpu-inference.yml'
Expand All @@ -10,7 +11,6 @@ on:
- '!deepspeed/inference/v2/**' # exclude v2 dir
- 'tests/unit/inference/**'
- '!tests/unit/inference/v2/**' # exclude v2 tests dir
workflow_dispatch:
merge_group:
branches: [ master ]
schedule:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/formatting.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: Formatting

on:
workflow_dispatch:
pull_request:
branches:
'**'
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/nv-accelerate-v100.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-accelerate-v100

on:
workflow_dispatch:
pull_request:
paths-ignore:
- 'docs/**'
Expand Down Expand Up @@ -28,7 +29,7 @@ jobs:

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-h100.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: nv-h100

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down
15 changes: 5 additions & 10 deletions .github/workflows/nv-inference.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-inference

on:
workflow_dispatch:
pull_request:
paths:
- '.github/workflows/nv-inference.yml'
Expand Down Expand Up @@ -39,7 +40,7 @@ jobs:
run: |
git clone https://github.com/huggingface/transformers
cd transformers
git checkout f370bebdc
#git checkout f370bebdc
git rev-parse --short HEAD
pip install .
Expand All @@ -56,12 +57,6 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6"
coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
- name: Coverage report
run: |
cd tests
coverage combine
coverage report -m
pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6"
pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
1 change: 1 addition & 0 deletions .github/workflows/nv-lightning-v100.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-lightning-v100

on:
workflow_dispatch:
pull_request:
paths-ignore:
- 'docs/**'
Expand Down
63 changes: 0 additions & 63 deletions .github/workflows/nv-megatron.yml

This file was deleted.

1 change: 1 addition & 0 deletions .github/workflows/nv-mii.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-mii

on:
workflow_dispatch:
pull_request:
paths:
- '.github/workflows/nv-mii.yml'
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/nv-nightly.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-nightly

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"

Expand Down Expand Up @@ -37,6 +38,10 @@ jobs:
git rev-parse --short HEAD
pip install .
- name: Install datasets
run: |
pip install datasets
- name: Install deepspeed
run: |
pip install .[dev,1bit,autotuning,inf]
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/nv-pre-compile-ops.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-pre-compile-ops

on:
workflow_dispatch:
pull_request:
branches:
'**'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-sd.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: nv-sd

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * 0"
workflow_dispatch:
pull_request:
paths:
- "deepspeed/ops/transformer/inference/diffusers_**"
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/nv-torch-latest-cpu.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-torch-latest-cpu

on:
workflow_dispatch:
pull_request:
paths-ignore:
- 'docs/**'
Expand Down
11 changes: 3 additions & 8 deletions .github/workflows/nv-torch-latest-v100.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-torch-latest-v100

on:
workflow_dispatch:
pull_request:
paths-ignore:
- 'docs/**'
Expand Down Expand Up @@ -54,11 +55,5 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.1" --cuda_ver="11.8"
coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.1" --cuda_ver="11.8"
- name: Coverage report
run: |
cd tests
coverage combine
coverage report -m
pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.2" --cuda_ver="11.8"
pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.2" --cuda_ver="11.8"
1 change: 1 addition & 0 deletions .github/workflows/nv-torch-nightly-v100.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: nv-torch-nightly-v100

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/nv-torch110-p40.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: nv-torch110-p40

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down Expand Up @@ -51,7 +51,7 @@ jobs:
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.10" --cuda_ver="11.1"
DS_ALLOW_DEPRECATED_FP16=1 pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.10" --cuda_ver="11.1"
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch110-v100.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: nv-torch110-v100

on:
workflow_dispatch:
schedule:
- cron: "0 0 * * *"
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: python

on:
workflow_dispatch:
pull_request:
branches:
'**'
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,10 +129,10 @@ DeepSpeed has been integrated with several different popular open-source DL fram
| Description | Status |
| ----------- | ------ |
| NVIDIA | [![nv-torch110-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml) [![nv-torch110-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-h100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) |
| AMD | [![amd-mi100](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi100.yml) [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) |
| AMD | [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) |
| CPU | [![nv-torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml) |
| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) |
| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-megatron](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-megatron.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-megatron.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) |
| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) |
| Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) |

# Installation
Expand Down
16 changes: 15 additions & 1 deletion accelerator/cuda_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,17 +180,31 @@ def available_memory(self, device_index=None):

# Data types
# Report whether bfloat16 can be used.
# Returns True unconditionally when torch.cuda.is_available() is False;
# otherwise returns the result of torch.cuda.is_bf16_supported().
def is_bf16_supported(self):
# No CUDA runtime visible: report support without probing any device.
# NOTE(review): presumably so GPU-less hosts (e.g. build machines) are not
# rejected up front — confirm the intent with the author.
if not torch.cuda.is_available():
return True
return torch.cuda.is_bf16_supported()

# Report whether float16 can be used.
# Returns True when CUDA is unavailable, or when the compute-capability major
# version is >= 7. For major == 6 it returns True only if the environment
# variable DS_ALLOW_DEPRECATED_FP16 equals '1'. Returns False in all other cases.
def is_fp16_supported(self):
# No CUDA runtime visible: report support without querying a device.
if not torch.cuda.is_available():
return True
# See https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix
# FP16 on compute capability 6.x is deprecated
# Opt-in switch: unset, or any value other than the exact string '1', leaves
# deprecated FP16 disabled.
allow_deprecated_fp16 = os.environ.get('DS_ALLOW_DEPRECATED_FP16', '0') == '1'
# Called without a device argument; only the major version is used, the
# minor version is discarded.
major, _ = torch.cuda.get_device_capability()
if major >= 7:
return True
elif major == 6 and allow_deprecated_fp16:
return True
else:
return False

# Return the list of torch dtypes reported as supported.
# NOTE(review): this text is a rendered diff hunk ("15 additions & 1 deletion")
# with no +/- markers. The one-line `return [...]` directly below looks like the
# removed pre-change body, and the list-building code after it like its
# replacement. Read literally, that first return would make everything after it
# unreachable — confirm against the real file before relying on either version.
def supported_dtypes(self):
return [torch.float, torch.half, torch.bfloat16]
# torch.float is always included; half and bfloat16 are appended only when the
# corresponding is_*_supported() check on this object returns a truthy value.
supported_dtypes = [torch.float]
if self.is_fp16_supported():
supported_dtypes.append(torch.half)
if self.is_bf16_supported():
supported_dtypes.append(torch.bfloat16)
return supported_dtypes

# Misc
def amp(self):
Expand Down
6 changes: 3 additions & 3 deletions accelerator/hpu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,13 @@ def is_triton_supported(self):

# Graph operations
# Create and return a graph object.
# NOTE(review): this text is a rendered diff hunk with no +/- markers.
# `return None` looks like the removed line and `return self.hpu.HPUGraph()`
# like the line that replaces it; read literally, the second return is
# unreachable — confirm against the real file.
def create_graph(self):
return None
return self.hpu.HPUGraph()

# Return the object used to capture work into `graph`.
# `pool` is accepted but not used by any line shown here.
# NOTE(review): this text is a rendered diff hunk ("3 additions & 3 deletions")
# with no +/- markers. The noop_context import and `return noop_context()` look
# like removed lines, and `return self.hpu.graph(graph, stream=stream)` like the
# replacement — presumably a capture context manager; confirm against the real
# file and the HPU API.
def capture_to_graph(self, graph, pool=None, stream=None):
from deepspeed.runtime.utils import noop_context
return noop_context()
return self.hpu.graph(graph, stream=stream)

# Call replay() on the given graph object; always returns None.
def replay_graph(self, graph):
graph.replay()
# Bare return: the result of graph.replay() is not propagated to the caller.
return

# Tensor operations
Expand Down
2 changes: 1 addition & 1 deletion deepspeed/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ def init_inference(model, config=None, **kwargs):
.. code-block:: python
generator.model = deepspeed.init_inference(generator.model,
mp_size=world_size,
tensor_parallel={"tp_size": world_size},
dtype=torch.half,
replace_with_kernel_inject=True)
string = generator("DeepSpeed is")
Expand Down
3 changes: 3 additions & 0 deletions deepspeed/autotuning/autotuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,7 @@ def model_info_profile_run(self):
exp_config[DS_CONFIG] = ds_config
exp_config['num_gpus'] = self.exp_num_gpus
exp_config['num_nodes'] = self.exp_num_nodes
exp_config['hostfile'] = self.args.hostfile
exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')

with open(exp_path, 'w', buffering=BUFSIZE) as fd:
Expand Down Expand Up @@ -761,6 +762,7 @@ def run_tuning_micro_batch_sizes(self, tuning_micro_batch_sizes, max_train_batch
exp_config[DS_CONFIG] = ds_config
exp_config['num_gpus'] = self.exp_num_gpus
exp_config['num_nodes'] = self.exp_num_nodes
exp_config['hostfile'] = self.args.hostfile
exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')

with open(exp_path, 'w', buffering=BUFSIZE) as fd:
Expand Down Expand Up @@ -1055,6 +1057,7 @@ def run_ds_config(self, ds_config, exp_name):
exp_config[DS_CONFIG] = ds_config
exp_config['num_gpus'] = self.exp_num_gpus
exp_config['num_nodes'] = self.exp_num_nodes
exp_config['hostfile'] = self.args.hostfile
exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')

logger.debug(f'run_ds_config exp_name = {exp_name}')
Expand Down
3 changes: 3 additions & 0 deletions deepspeed/autotuning/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,10 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
include_str += f"{reservation.node.host}:{slots}@"
include_str = include_str[:-1]
master_port = exp["master_port"]
hostfile = exp["hostfile"]
exp["launcher_args"] = [
"--hostfile",
f"{hostfile}",
"--include",
f"{include_str}",
"--master_port",
Expand Down
Loading

0 comments on commit 552aa5b

Please sign in to comment.