Commit

Merge branch 'main' of https://github.com/Y-IAB/axolotl into checkpoint-callback
rifqiyan committed Mar 29, 2024
2 parents b917faa + e0ed008 commit 0b76c97
Showing 43 changed files with 403 additions and 1,454 deletions.
11 changes: 8 additions & 3 deletions .github/workflows/base.yml
@@ -16,17 +16,22 @@ jobs:
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps:
- name: Checkout
uses: actions/checkout@v3
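As an aside on the arch-list change above (not part of the diff): the added `8.7` and `8.9` entries cover Jetson Orin and Ada-generation GPUs (RTX 40-series, L4, L40). A quick way to check which compute capability a local GPU reports, using PyTorch's public API, is a short sketch like this:

```python
import torch

# Print each visible GPU's compute capability; the matching value (e.g. "8.9"
# for an RTX 4090) needs to appear in torch_cuda_arch_list for prebuilt kernels.
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        major, minor = torch.cuda.get_device_capability(i)
        print(f"GPU {i}: {torch.cuda.get_device_name(i)} -> compute capability {major}.{minor}")
else:
    print("No CUDA device visible")
```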
31 changes: 31 additions & 0 deletions .github/workflows/docs.yml
@@ -0,0 +1,31 @@
name: Publish Docs
on:
  push:
    branches:
      - main

permissions:
  contents: write
  pages: write

jobs:
  build-deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
      - name: Set up Quarto
        uses: quarto-dev/quarto-actions/setup@v2
      - name: Setup Python
        uses: actions/setup-python@v3
        with:
          python-version: '3.10'
      - name: install dependencies
        run: |
          python3 -m pip install jupyter
      - name: Publish to GitHub Pages (and render)
        uses: quarto-dev/quarto-actions/publish@v2
        with:
          target: gh-pages
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
3 changes: 3 additions & 0 deletions .gitignore
@@ -2,6 +2,7 @@
configs
last_run_prepared/
.vscode
_site/

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -172,3 +173,5 @@ wandb
lora-out/*
qlora-out/*
mlruns/*

/.quarto/
23 changes: 14 additions & 9 deletions README.md
@@ -32,6 +32,7 @@ Features:
- [Bare Metal Cloud GPU](#bare-metal-cloud-gpu)
- [Windows](#windows)
- [Mac](#mac)
- [Google Colab](#google-colab)
- [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
- [Dataset](#dataset)
- [How to Add Custom Prompts](#how-to-add-custom-prompts)
@@ -42,8 +43,8 @@ Features:
- [Merge LORA to Base](#merge-lora-to-base)
- [Special Tokens](#special-tokens)
- Advanced Topics
- [Multipack](./docs/multipack.md)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
- [RLHF & DPO](./docs/rlhf.md)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
- [Multipack](./docs/multipack.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
- [RLHF & DPO](./docs/rlhf.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
- [Common Errors](#common-errors-)
- [Tokenization Mismatch b/w Training & Inference](#tokenization-mismatch-bw-inference--training)
- [Debugging Axolotl](#debugging-axolotl)
@@ -149,7 +150,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAcc
```

>[!Tip]
> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.md#debugging-with-docker).
> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.qmd#debugging-with-docker).
<details>

@@ -267,7 +268,11 @@ Use the below instead of the install method in QuickStart.
```
pip3 install -e '.'
```
More info: [mac.md](/docs/mac.md)
More info: [mac.md](/docs/mac.qmd)

#### Google Colab

Please use this example [notebook](examples/colab-notebooks/colab-axolotl-example.ipynb).

#### Launching on public clouds via SkyPilot
To launch on GPU instances (both on-demand and spot instances) on 7+ clouds (GCP, AWS, Azure, OCI, and more), you can use [SkyPilot](https://skypilot.readthedocs.io/en/latest/index.html):
@@ -409,7 +414,7 @@ pretraining_dataset: # hf path only
{"segments": [{"label": true|false, "text": "..."}]}
```

This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction. See [these docs](docs/input_output.md) for more details.
This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction. See [these docs](docs/input_output.qmd) for more details.

##### Conversation

@@ -1130,7 +1135,7 @@ fsdp_config:
##### FSDP + QLoRA
Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.md) for more information.
Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.qmd) for more information.
##### Weights & Biases Logging
@@ -1209,7 +1214,7 @@ although this will be very slow, and using the config options above are recommen

## Common Errors 🧰

See also the [FAQ's](./docs/faq.md) and [debugging guide](docs/debugging.md).
See also the [FAQ's](./docs/faq.qmd) and [debugging guide](docs/debugging.qmd).

> If you encounter a 'Cuda out of memory' error, it means your GPU ran out of memory during the training process. Here's how to resolve it:

@@ -1243,7 +1248,7 @@ It's safe to ignore it.

> NCCL Timeouts during training

See the [NCCL](docs/nccl.md) guide.
See the [NCCL](docs/nccl.qmd) guide.


### Tokenization Mismatch b/w Inference & Training
@@ -1261,7 +1266,7 @@ Having misalignment between your prompts during training and inference can cause

## Debugging Axolotl

See [this debugging guide](docs/debugging.md) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.
See [this debugging guide](docs/debugging.qmd) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.

## Need help? 🙋

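Editorial aside on the `segments` snippet shown in the README hunk above: a hypothetical record in the template-free `input_output` format might look like the sketch below. The strings are invented for illustration, and as I understand the format, `label` controls whether a segment's tokens contribute to the training loss; see docs/input_output.qmd for the authoritative spec.

```python
import json

# Hypothetical example of one JSONL record in the template-free `segments` format.
# `label: False` marks text that is masked from the loss; `label: True` is trained on.
record = {
    "segments": [
        {"label": False, "text": "<s>Human: What is an axolotl?\n"},
        {"label": True, "text": "Assistant: A neotenic salamander native to Mexico.</s>"},
    ]
}

with open("template_free.jsonl", "a") as f:
    f.write(json.dumps(record) + "\n")
```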
51 changes: 51 additions & 0 deletions _quarto.yml
@@ -0,0 +1,51 @@
project:
  type: website

website:
  title: "Axolotl"
  description: "Fine-tuning"
  favicon: favicon.jpg
  navbar:
    title: Axolotl
    background: dark
    pinned: false
    collapse: false
    tools:
      - icon: twitter
        href: https://twitter.com/axolotl_ai
      - icon: github
        href: https://github.com/OpenAccess-AI-Collective/axolotl/
      - icon: discord
        href: https://discord.gg/7m9sfhzaf3

  sidebar:
    pinned: true
    collapse-level: 2
    style: docked
    contents:
      - text: Home
        href: index.qmd
      - section: "How-To Guides"
        contents:
          # TODO Edit folder structure after we have more docs.
          - docs/debugging.qmd
          - docs/multipack.qmd
          - docs/fdsp_qlora.qmd
          - docs/input_output.qmd
          - docs/rlhf.qmd
          - docs/nccl.qmd
          - docs/mac.qmd
          - docs/multi-node.qmd
      - section: "Reference"
        contents:
          - docs/config.qmd
          - docs/faq.qmd




format:
  html:
    theme: materia
    css: styles.css
    toc: true
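One sketch that may be handy alongside this config (an editorial suggestion, not part of the commit): a small script that checks every document referenced in the sidebar actually exists on disk, which would flag a renamed or misspelled filename before Quarto renders the site. It assumes PyYAML is installed and that the script runs from the repository root.

```python
from pathlib import Path

import yaml  # pip install pyyaml

# Walk the sidebar `contents` tree of _quarto.yml and report any referenced
# .qmd file that does not exist relative to the repository root.
config = yaml.safe_load(Path("_quarto.yml").read_text())

def iter_docs(items):
    for item in items:
        if isinstance(item, str):
            yield item
        elif isinstance(item, dict):
            if "href" in item:
                yield item["href"]
            yield from iter_docs(item.get("contents", []))

for doc in iter_docs(config["website"]["sidebar"]["contents"]):
    if doc.endswith(".qmd") and not Path(doc).exists():
        print(f"Sidebar references a missing file: {doc}")
```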
2 changes: 1 addition & 1 deletion devtools/README.md
@@ -1 +1 @@
This directory contains example config files that might be useful for debugging. Please see [docs/debugging.md](../docs/debugging.md) for more information.
This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.
2 changes: 2 additions & 0 deletions docs/.gitignore
@@ -0,0 +1,2 @@
/.quarto/
_site/
17 changes: 17 additions & 0 deletions docs/config.qmd
@@ -0,0 +1,17 @@
---
title: Config options
description: A complete list of all configuration options.
---

```{python}
#|echo: false
#|output: asis
import re
# Regex pattern to match the YAML block including its code fence
pattern = r'<details[^>]*id="all-yaml-options"[^>]*>.*?<summary>All yaml options.*?```yaml(.*?)```.*?</details>'
with open('../README.md', 'r') as f:
    doc = f.read()
match = re.search(pattern, doc, re.DOTALL)
print("```yaml", match.group(1).strip(), "```", sep="\n")
```
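The snippet above assumes the README always contains a fenced YAML block inside a `<details id="all-yaml-options">` element; if that section were ever renamed, `match.group(1)` would raise an `AttributeError`. A slightly defensive variant (an editorial sketch, not what the commit ships) could fail with a clearer message:

```python
import re

fence = "`" * 3  # build the backtick fence so it doesn't clash with this example block
pattern = (
    r'<details[^>]*id="all-yaml-options"[^>]*>.*?<summary>All yaml options.*?'
    + fence + r"yaml(.*?)" + fence + r".*?</details>"
)

with open("../README.md", "r") as f:
    doc = f.read()

match = re.search(pattern, doc, re.DOTALL)
if match is None:
    raise SystemExit("Could not find the 'All yaml options' block in ../README.md")
print(fence + "yaml", match.group(1).strip(), fence, sep="\n")
```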
6 changes: 5 additions & 1 deletion docs/debugging.md → docs/debugging.qmd
@@ -1,4 +1,8 @@
# Debugging Axolotl
---
title: Debugging
description: How to debug Axolotl
---


This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes.

18 changes: 0 additions & 18 deletions docs/faq.md

This file was deleted.

21 changes: 21 additions & 0 deletions docs/faq.qmd
@@ -0,0 +1,21 @@
---
title: FAQ
description: Frequently asked questions
---


**Q: The trainer stopped and hasn't progressed in several minutes.**

> A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd)

**Q: Exitcode -9**

> A: This usually happens when you run out of system RAM.

**Q: Exitcode -7 while using deepspeed**

> A: Try upgrading deepspeed w: `pip install -U deepspeed`

**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**

> A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
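For that last entry, a tiny pre-flight check along these lines (a hypothetical sketch, not part of the FAQ) can confirm whether more than one GPU is actually visible before enabling `deepspeed:` in the config:

```python
import torch

# DeepSpeed only makes sense with more than one GPU; on a single-GPU machine,
# leaving `deepspeed:` unset avoids the DummyOptim error mentioned above.
num_gpus = torch.cuda.device_count()
if num_gpus <= 1:
    print(f"{num_gpus} GPU visible: remove `deepspeed:` from the YAML/CLI")
else:
    print(f"{num_gpus} GPUs visible: a deepspeed config is reasonable")
```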
8 changes: 7 additions & 1 deletion docs/fsdp_qlora.md → docs/fsdp_qlora.qmd
@@ -1,4 +1,10 @@
# FDSP + QLoRA
---
title: FDSP + QLoRA
description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs.
format:
  html:
    toc: true
---

## Background

5 changes: 4 additions & 1 deletion docs/input_output.md → docs/input_output.qmd
@@ -1,4 +1,7 @@
# Template-free prompt construction with the `input_output` format
---
title: Template-free prompt construction
description: "Template-free prompt construction with the `input_output` format"
---

<!-- TOC -->

6 changes: 5 additions & 1 deletion docs/mac.md → docs/mac.qmd
@@ -1,8 +1,12 @@
# Mac M series support
---
title: Mac M-series
description: Mac M-series support
---

Currently Axolotl on Mac is partially usable, many of the dependencies of Axolotl including Pytorch do not support MPS or have incomplete support.

Current support:

- [x] Support for all models
- [x] Full training of models
- [x] LoRA training
5 changes: 4 additions & 1 deletion docs/multi-node.md → docs/multi-node.qmd
@@ -1,4 +1,7 @@
# Multi Node
---
title: Multi Node
description: How to use Axolotl on multiple machines
---

You will need to create a configuration for accelerate, either by using `accelerate config` and follow the instructions or you can use one of the preset below:

5 changes: 4 additions & 1 deletion docs/multipack.md → docs/multipack.qmd
@@ -1,4 +1,7 @@
# Multipack (Sample Packing)
---
title: Multipack (Sample Packing)
description: Multipack is a technique to pack multiple sequences into a single batch to increase training throughput.
---

## Visualization of Multipack with Flash Attention

5 changes: 4 additions & 1 deletion docs/nccl.md → docs/nccl.qmd
@@ -1,4 +1,7 @@
# NCCL
---
title: NCCL
description: Troubleshooting NCCL issues
---

NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort:

5 changes: 4 additions & 1 deletion docs/rlhf.md → docs/rlhf.qmd
@@ -1,4 +1,7 @@
# RLHF (Beta)
---
title: "RLHF (Beta)"
description: "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback."
---

### Overview

3 changes: 2 additions & 1 deletion examples/gemma/qlora.yml
@@ -21,7 +21,8 @@ lora_dropout: 0.05
lora_target_linear: true

sequence_len: 4096
sample_packing: false
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

wandb_project:
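For intuition about the `sample_packing: true` change above: packing concatenates several short tokenized examples into one sequence of at most `sequence_len` tokens, so less compute is spent on padding. A toy greedy sketch follows (illustrative only; Axolotl's multipack implementation additionally handles attention masking and batching, see docs/multipack.qmd):

```python
def pack_examples(tokenized_examples, sequence_len=4096):
    """Greedily pack token lists into bins holding at most `sequence_len` tokens."""
    packs, current = [], []
    for tokens in tokenized_examples:
        if current and len(current) + len(tokens) > sequence_len:
            packs.append(current)
            current = []
        current.extend(tokens)
    if current:
        packs.append(current)
    return packs

# Three short "tokenized" samples fit into a single 4096-token pack.
print(len(pack_examples([[1] * 1000, [2] * 1500, [3] * 1200])))  # -> 1
```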
8 changes: 7 additions & 1 deletion examples/llama-2/qlora-fsdp.yml
@@ -36,7 +36,7 @@ wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 4
optimizer: paged_adamw_8bit
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.00001

@@ -66,5 +66,11 @@ weight_decay: 0.0
fsdp:
  - full_shard
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: SHARDED_STATE_DICT
special_tokens:
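A quick bit of arithmetic that follows from values earlier in this file (`micro_batch_size: 4`, `gradient_accumulation_steps: 4`): under data parallelism the effective batch per optimizer step is roughly `micro_batch_size × gradient_accumulation_steps × num_gpus`. A hypothetical two-GPU FSDP run would therefore see:

```python
micro_batch_size = 4             # from the config above
gradient_accumulation_steps = 4  # from the config above
num_gpus = 2                     # hypothetical 2-GPU FSDP setup

# Sequences contributing to each optimizer step:
print(micro_batch_size * gradient_accumulation_steps * num_gpus)  # -> 32
```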
12 changes: 0 additions & 12 deletions examples/mistral/Mistral-7b-example/README.md

This file was deleted.
