Merge branch 'master' into gma/add_autotp_workflow

microsoft · Feb 5, 2024 · 552aa5b
2 parents 33caddd + 3e6d606
commit 552aa5b
Show file tree

Hide file tree

Showing 60 changed files with 1,181 additions and 447 deletions.
diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
@@ -1,9 +1,9 @@
 name: amd-mi200
 
 on:
+  workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  workflow_dispatch:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}

diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml
@@ -1,6 +1,7 @@
 name: cpu-inference
 
 on:
+  workflow_dispatch:
   pull_request:
     paths:
       - '.github/workflows/cpu-inference.yml'
@@ -10,7 +11,6 @@ on:
       - '!deepspeed/inference/v2/**' # exclude v2 dir
       - 'tests/unit/inference/**'
       - '!tests/unit/inference/v2/**' # exclude v2 tests dir
-  workflow_dispatch:
   merge_group:
     branches: [ master ]
   schedule:

diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
@@ -1,6 +1,7 @@
 name: Formatting
 
 on:
+  workflow_dispatch:
   pull_request:
     branches:
       '**'

diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
@@ -1,6 +1,7 @@
 name: nv-accelerate-v100
 
 on:
+  workflow_dispatch:
   pull_request:
     paths-ignore:
       - 'docs/**'
@@ -28,7 +29,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
+          pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu118
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 

diff --git a/.github/workflows/nv-h100.yml b/.github/workflows/nv-h100.yml
@@ -1,9 +1,9 @@
 name: nv-h100
 
 on:
+  workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  workflow_dispatch:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}

diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml
@@ -1,6 +1,7 @@
 name: nv-inference
 
 on:
+  workflow_dispatch:
   pull_request:
     paths:
       - '.github/workflows/nv-inference.yml'
@@ -39,7 +40,7 @@ jobs:
         run: |
           git clone https://github.com/huggingface/transformers
           cd transformers
-          git checkout f370bebdc
+          #git checkout f370bebdc
           git rev-parse --short HEAD
           pip install .
 
@@ -56,12 +57,6 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
-          coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6"
-          coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
-
-      - name: Coverage report
-        run: |
-          cd tests
-          coverage combine
-          coverage report -m
+          pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+          pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6"
+          pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6"
diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml
@@ -1,6 +1,7 @@
 name: nv-lightning-v100
 
 on:
+  workflow_dispatch:
   pull_request:
     paths-ignore:
       - 'docs/**'

diff --git a/.github/workflows/nv-megatron.yml b/.github/workflows/nv-megatron.yml
diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml
@@ -1,6 +1,7 @@
 name: nv-mii
 
 on:
+  workflow_dispatch:
   pull_request:
     paths:
       - '.github/workflows/nv-mii.yml'

diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml
@@ -1,6 +1,7 @@
 name: nv-nightly
 
 on:
+  workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
 
@@ -37,6 +38,10 @@ jobs:
           git rev-parse --short HEAD
           pip install .
 
+      - name: Install datasets
+        run: |
+          pip install datasets
+
       - name: Install deepspeed
         run: |
           pip install .[dev,1bit,autotuning,inf]

diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml
@@ -1,6 +1,7 @@
 name: nv-pre-compile-ops
 
 on:
+  workflow_dispatch:
   pull_request:
     branches:
       '**'

diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml
@@ -1,9 +1,9 @@
 name: nv-sd
 
 on:
+  workflow_dispatch:
   schedule:
     - cron: "0 0 * * 0"
-  workflow_dispatch:
   pull_request:
     paths:
       - "deepspeed/ops/transformer/inference/diffusers_**"

diff --git a/.github/workflows/nv-torch-latest-cpu.yml b/.github/workflows/nv-torch-latest-cpu.yml
@@ -1,6 +1,7 @@
 name: nv-torch-latest-cpu
 
 on:
+  workflow_dispatch:
   pull_request:
     paths-ignore:
       - 'docs/**'

diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml
@@ -1,6 +1,7 @@
 name: nv-torch-latest-v100
 
 on:
+  workflow_dispatch:
   pull_request:
     paths-ignore:
       - 'docs/**'
@@ -54,11 +55,5 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.1" --cuda_ver="11.8"
-          coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.1" --cuda_ver="11.8"
-
-      - name: Coverage report
-        run: |
-          cd tests
-          coverage combine
-          coverage report -m
+          pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.2" --cuda_ver="11.8"
+          pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.2" --cuda_ver="11.8"
diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml
@@ -1,6 +1,7 @@
 name: nv-torch-nightly-v100
 
 on:
+  workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
 

diff --git a/.github/workflows/nv-torch110-p40.yml b/.github/workflows/nv-torch110-p40.yml
@@ -1,9 +1,9 @@
 name: nv-torch110-p40
 
 on:
+  workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  workflow_dispatch:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -51,7 +51,7 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.10" --cuda_ver="11.1"
+          DS_ALLOW_DEPRECATED_FP16=1 pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.10" --cuda_ver="11.1"
 
       - name: Open GitHub issue if nightly CI fails
         if: ${{ failure() && (github.event_name == 'schedule') }}

diff --git a/.github/workflows/nv-torch110-v100.yml b/.github/workflows/nv-torch110-v100.yml
@@ -1,9 +1,9 @@
 name: nv-torch110-v100
 
 on:
+  workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  workflow_dispatch:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
@@ -1,6 +1,7 @@
 name: python
 
 on:
+  workflow_dispatch:
   pull_request:
     branches:
       '**'

diff --git a/README.md b/README.md
@@ -129,10 +129,10 @@ DeepSpeed has been integrated with several different popular open-source DL fram
 | Description | Status |
 | ----------- | ------ |
 | NVIDIA | [![nv-torch110-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml) [![nv-torch110-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-h100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) |
-| AMD | [![amd-mi100](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi100.yml) [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) |
+| AMD | [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) |
 | CPU | [![nv-torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml) |
 | PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) |
-| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-megatron](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-megatron.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-megatron.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) |
+| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) [![nv-sd](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-sd.yml) |
 | Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) |
 
 # Installation

diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
@@ -180,17 +180,31 @@ def available_memory(self, device_index=None):
 
     # Data types
     def is_bf16_supported(self):
+        if not torch.cuda.is_available():
+            return True
         return torch.cuda.is_bf16_supported()
 
     def is_fp16_supported(self):
+        if not torch.cuda.is_available():
+            return True
+        # See https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix
+        # FP16 on compute capability 6.x is deprecated
+        allow_deprecated_fp16 = os.environ.get('DS_ALLOW_DEPRECATED_FP16', '0') == '1'
         major, _ = torch.cuda.get_device_capability()
         if major >= 7:
             return True
+        elif major == 6 and allow_deprecated_fp16:
+            return True
         else:
             return False
 
     def supported_dtypes(self):
-        return [torch.float, torch.half, torch.bfloat16]
+        supported_dtypes = [torch.float]
+        if self.is_fp16_supported():
+            supported_dtypes.append(torch.half)
+        if self.is_bf16_supported():
+            supported_dtypes.append(torch.bfloat16)
+        return supported_dtypes
 
     # Misc
     def amp(self):

diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py
@@ -174,13 +174,13 @@ def is_triton_supported(self):
 
     # Graph operations
     def create_graph(self):
-        return None
+        return self.hpu.HPUGraph()
 
     def capture_to_graph(self, graph, pool=None, stream=None):
-        from deepspeed.runtime.utils import noop_context
-        return noop_context()
+        return self.hpu.graph(graph, stream=stream)
 
     def replay_graph(self, graph):
+        graph.replay()
         return
 
     # Tensor operations

diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py
@@ -286,7 +286,7 @@ def init_inference(model, config=None, **kwargs):
     .. code-block:: python
 
         generator.model = deepspeed.init_inference(generator.model,
-                                                    mp_size=world_size,
+                                                    tensor_parallel={"tp_size": world_size},
                                                     dtype=torch.half,
                                                     replace_with_kernel_inject=True)
         string = generator("DeepSpeed is")

diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py
@@ -683,6 +683,7 @@ def model_info_profile_run(self):
         exp_config[DS_CONFIG] = ds_config
         exp_config['num_gpus'] = self.exp_num_gpus
         exp_config['num_nodes'] = self.exp_num_nodes
+        exp_config['hostfile'] = self.args.hostfile
         exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')
 
         with open(exp_path, 'w', buffering=BUFSIZE) as fd:
@@ -761,6 +762,7 @@ def run_tuning_micro_batch_sizes(self, tuning_micro_batch_sizes, max_train_batch
             exp_config[DS_CONFIG] = ds_config
             exp_config['num_gpus'] = self.exp_num_gpus
             exp_config['num_nodes'] = self.exp_num_nodes
+            exp_config['hostfile'] = self.args.hostfile
             exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')
 
             with open(exp_path, 'w', buffering=BUFSIZE) as fd:
@@ -1055,6 +1057,7 @@ def run_ds_config(self, ds_config, exp_name):
         exp_config[DS_CONFIG] = ds_config
         exp_config['num_gpus'] = self.exp_num_gpus
         exp_config['num_nodes'] = self.exp_num_nodes
+        exp_config['hostfile'] = self.args.hostfile
         exp_path = os.path.join(self.exps_dir, f'{exp_name}.json')
 
         logger.debug(f'run_ds_config exp_name = {exp_name}')

diff --git a/deepspeed/autotuning/scheduler.py b/deepspeed/autotuning/scheduler.py
@@ -316,7 +316,10 @@ def run_experiment(exp: dict, reservations, user_script, user_args):
         include_str += f"{reservation.node.host}:{slots}@"
     include_str = include_str[:-1]
     master_port = exp["master_port"]
+    hostfile = exp["hostfile"]
     exp["launcher_args"] = [
+        "--hostfile",
+        f"{hostfile}",
         "--include",
         f"{include_str}",
         "--master_port",