Merge branch 'main' into calc-packing-eff-across-all-ranks
winglian authored Sep 12, 2023
2 parents d45b5d3 + 772cd87 commit 6237305
Showing 44 changed files with 2,245 additions and 564 deletions.
10 changes: 0 additions & 10 deletions .github/workflows/main.yml
@@ -23,11 +23,6 @@ jobs:
python_version: "3.10"
pytorch: 2.0.1
axolotl_extras:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.1
axolotl_extras: gptq
runs-on: self-hosted
steps:
- name: Checkout
@@ -73,11 +68,6 @@ jobs:
pytorch: 2.0.1
axolotl_extras:
is_latest: true
- cuda: 118
cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.1
axolotl_extras: gptq
runs-on: self-hosted
steps:
- name: Checkout
45 changes: 45 additions & 0 deletions .github/workflows/pypi.yml
@@ -0,0 +1,45 @@
name: publish pypi

on:
push:
tags:
- '*'

jobs:
pypi-publish:
name: Upload release to PyPI
runs-on: ubuntu-latest
environment:
name: pypi
url: https://pypi.org/p/axolotl
permissions:
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
steps:
- name: Check out repository code
uses: actions/checkout@v3

- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: "3.10"

- name: Install dependencies
run: |
pip3 install wheel
pip3 install -e .
pip3 install -r requirements-tests.txt
- name: Extract tag name
id: tag
run: echo ::set-output name=TAG_NAME::$(echo $GITHUB_REF | cut -d / -f 3)

- name: Update version in setup.py
run: >-
sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py
- name: Build a binary wheel
run: >-
python setup.py sdist bdist_wheel
- name: Publish package distributions to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
@@ -24,8 +24,8 @@ jobs:

- name: Install dependencies
run: |
pip install -e .[peft]
pip install -r requirements-tests.txt
pip3 install -e .
pip3 install -r requirements-tests.txt
- name: Run tests
run: |
84 changes: 63 additions & 21 deletions README.md
@@ -90,8 +90,7 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
```bash
docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
```
- `winglian/axolotl-runpod:main-py3.10-cu118-2.0.1`: for runpod
- `winglian/axolotl-runpod:main-py3.9-cu118-2.0.1-gptq`: for gptq
- `winglian/axolotl-runpod:main-latest`: for runpod or use this [direct link](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)

Or run on the current files for development:

@@ -104,19 +103,9 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \

2. Install pytorch stable https://pytorch.org/get-started/locally/

3. Install python dependencies with ONE of the following:
- Recommended, supports QLoRA, NO gptq/int4 support
3. Install axolotl along with python dependencies
```bash
pip3 install -e .
pip3 install -U git+https://github.com/huggingface/peft.git
```
- gptq/int4 support, NO QLoRA
```bash
pip3 install -e .[gptq]
```
- same as above but not recommended
```bash
pip3 install -e .[gptq_triton]
pip3 install -e .[flash-attn]
```

- LambdaLabs
@@ -151,10 +140,9 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
git clone https://github.com/OpenAccess-AI-Collective/axolotl
cd axolotl
pip3 install -e . # change depend on needs
pip3 install -e .
pip3 install protobuf==3.20.3
pip3 install -U --ignore-installed requests Pillow psutil scipy
pip3 install git+https://github.com/huggingface/peft.git # not for gptq
```

5. Set path
@@ -163,6 +151,8 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
```
</details>

- Windows: Please use WSL or Docker!

### Dataset

Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
@@ -328,6 +318,15 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
name: enron_emails
type: completion # format from earlier
# huggingface repo with multiple named configurations/subsets
datasets:
- path: bigcode/commitpackft
name:
- ruby
- python
- typescript
type: ... # unimplemented custom format
# local
datasets:
- path: data.jsonl # or json
@@ -407,6 +406,10 @@ fp16: true
# Use CUDA tf32
tf32: true # require >=ampere
# No AMP (automatic mixed precision)
bfloat16: true # require >=ampere
float16: true
# a list of one or more datasets to finetune the model with
datasets:
# hf dataset repo | "json" for local dataset, make sure to fill data_files
@@ -459,6 +462,9 @@ dataset_shard_idx:
# the maximum length of an input to train with, this should typically be less than 2048
# as most models have a token/context limit of 2048
sequence_len: 2048
# pad inputs so each step uses constant sized buffers
# this will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
pad_to_sequence_len:
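# e.g. (illustrative, not a default): pad_to_sequence_len: true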
# max sequence length to concatenate training samples together up to
# inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
# FutureWarning: This will soon be DEPRECATED
@@ -493,6 +499,12 @@ lora_modules_to_save:
lora_out_dir:
lora_fan_in_fan_out: false
# ReLoRA configuration
# must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
relora_steps: # number of steps per ReLoRA restart
relora_warmup_steps: # number of per-restart warmup steps
relora_cpu_offload: # true to perform lora weight merges on cpu during restarts, for modest gpu memory savings
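# For illustration only (values are assumptions, not project defaults), a qlora
# run that restarts ReLoRA every 150 steps might set:
#   adapter: qlora
#   relora_steps: 150
#   relora_warmup_steps: 10
#   relora_cpu_offload: true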
# wandb configuration if you're using it
wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
wandb_project: # your wandb project name
@@ -515,7 +527,7 @@ lr_quadratic_warmup:
logging_steps:
save_strategy: # set to `no` to skip checkpoint saves
save_steps: # leave empty to save at each epoch
eval_steps:
eval_steps: # leave empty to eval at each epoch
save_total_limit: # checkpoints saved at a time
max_steps:
@@ -548,6 +560,30 @@ log_sweep_min_lr:
log_sweep_max_lr:
# specify optimizer
# Valid values are driven by the Transformers OptimizerNames class, see:
# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
#
# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
# in the examples/ for your model and fine-tuning use case.
#
# Valid values for 'optimizer' include:
# - adamw_hf
# - adamw_torch
# - adamw_torch_fused
# - adamw_torch_xla
# - adamw_apex_fused
# - adafactor
# - adamw_anyprecision
# - sgd
# - adagrad
# - adamw_bnb_8bit
# - lion_8bit
# - lion_32bit
# - paged_adamw_32bit
# - paged_adamw_8bit
# - paged_lion_32bit
# - paged_lion_8bit
optimizer:
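# For example (an illustrative choice, not a recommendation), the 8-bit AdamW
# from bitsandbytes listed above would be selected with:
#   optimizer: adamw_bnb_8bit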
# specify weight decay
weight_decay:
@@ -601,12 +637,14 @@ fsdp_config:
# Deepspeed config path
deepspeed:
# Advanced DDP Arguments
ddp_timeout:
ddp_bucket_cap_mb:
ddp_broadcast_buffers:
# Path to torch distx for optim 'adamw_anyprecision'
torchdistx_path:
# Set padding for data collator to 'longest'
collator_pad_to_longest:
# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
pretraining_dataset:
@@ -626,7 +664,7 @@ strict:

Run
```bash
accelerate launch scripts/finetune.py configs/your_config.yml
accelerate launch scripts/finetune.py your_config.yml
```

#### Multi-GPU
@@ -726,6 +764,10 @@ Try to turn off xformers.
It's safe to ignore it.

> NCCL Timeouts during training

See the [NCCL](docs/nccl.md) guide.

## Need help? 🙋♂️

Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
46 changes: 46 additions & 0 deletions deepspeed/zero2.json
@@ -0,0 +1,46 @@
{
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu"
},
"contiguous_gradients": true,
"overlap_comm": true
},
"bf16": {
"enabled": "auto"
},
"fp16": {
"enabled": "auto",
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-8,
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
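A minimal sketch of how this ZeRO-2 file might be referenced from a training config, using the `deepspeed` config-path option that appears in the README diff above (the relative path assumes the JSON sits at `deepspeed/zero2.json` under the working directory):

```yaml
# Illustrative Axolotl config excerpt: hand the ZeRO-2 JSON to the trainer.
# The "auto" entries in zero2.json (batch sizes, lr, precision, scheduler steps)
# are resolved from the training arguments by the HF Trainer/DeepSpeed integration.
deepspeed: deepspeed/zero2.json
```

Training is then launched as usual, e.g. `accelerate launch scripts/finetune.py your_config.yml`.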
5 changes: 1 addition & 4 deletions deepspeed/zero3.json
@@ -35,10 +35,7 @@
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.95
],
"betas": "auto",
"eps": 1e-8,
"weight_decay": "auto"
}
5 changes: 5 additions & 0 deletions docker-compose.yaml
@@ -9,6 +9,11 @@ services:
- ~/.cache/huggingface/:/root/.cache/huggingface/
# set environment variables
environment:
# Set environment variables
- GIT_AUTHOR_NAME=${GIT_AUTHOR_NAME}
- GIT_AUTHOR_EMAIL=${GIT_AUTHOR_EMAIL}
- GIT_COMMITTER_NAME=${GIT_COMMITTER_NAME}
- GIT_COMMITTER_EMAIL=${GIT_COMMITTER_EMAIL}
- WANDB_API_KEY=${WANDB_API_KEY}
deploy:
resources:
1 change: 0 additions & 1 deletion docker/Dockerfile
@@ -11,7 +11,6 @@ RUN apt-get update && \

WORKDIR /workspace

RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main"
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN cd axolotl && \
46 changes: 46 additions & 0 deletions docs/nccl.md
@@ -0,0 +1,46 @@
# NCCL

NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort:

```text
Watchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out.
```

Often, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. Nvidia recommends [disabling PCI access control services (ACS)](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#pci-access-control-services-acs) as a possible solution if this is available to you.

Forcing cross-GPU communication via [NVLink](https://en.wikipedia.org/wiki/NVLink) may help without increasing timeouts. To verify that your configuration is leveraging NVLink run the following command:

```shell
nvidia-smi nvlink --status
```

To force NCCL to use NVLink, simply set this in the environment:

```shell
export NCCL_P2P_LEVEL=NVL
```

If NVLink is not available in your environment there are other options for ``NCCL_P2P_LEVEL`` in the table below:

| NCCL_P2P_LEVEL | Description |
| -------------- | ----------- |
| PIX | P2P data transfers through no more than a single PCIe bridge. Faster data transfer rates than paths involving multiple bridges, but slower than direct GPU-to-GPU communication. |
| PXB | P2P data transfers through multiple PCIe bridges but not going through the PCIe Host Bridge; this path involves a complex routing process, potentially incurring a moderate level of latency. |
| PHB | P2P data transfers occur over PCIe and through a PCIe Host Bridge, typically involving the CPU, which can facilitate direct memory access but might introduce additional latency compared to more direct paths (e.g. PIX, NVL). |

To validate that acceptable data transfer speeds exist for your training job, running [NCCL Tests](https://github.com/NVIDIA/nccl-tests/blob/master/README.md) can help pinpoint bottlenecks, for example:

```shell
./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
```

It can be useful when debugging NCCL communication timeouts to activate additional logging in both PyTorch and NCCL:

```shell
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
export TORCH_DISTRIBUTED_DEBUG=INFO
export TORCHELASTIC_ERROR_FILE=/PATH/TO/torcherror.log
```

Finally, if you believe your training job needs more time you can increase the timeout past 30 minutes by setting the ``ddp_timeout`` value in the Axolotl configuration. See [PyTorch init_process_group](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for documentation on this value.
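As a concrete sketch (the 3600-second value is an illustrative assumption, not a recommendation), the timeout can be raised alongside the other DDP options shown in the README diff above:

```yaml
# Illustrative Axolotl config excerpt: allow slow collectives up to 60 minutes
# (double the 30-minute default) before the NCCL watchdog aborts training.
ddp_timeout: 3600
```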
