Merge branch 'main' into fix/tokenizer_cache_name
NanoCode012 authored Mar 25, 2024
2 parents 28d7777 + f1ebaa0 commit 1d347d9
Showing 46 changed files with 547 additions and 1,439 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/docs.yml
@@ -0,0 +1,31 @@
name: Publish Docs
on:
  push:
    branches:
      - main

permissions:
  contents: write
  pages: write

jobs:
  build-deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
      - name: Set up Quarto
        uses: quarto-dev/quarto-actions/setup@v2
      - name: Setup Python
        uses: actions/setup-python@v3
        with:
          python-version: '3.10'
      - name: install dependencies
        run: |
          python3 -m pip install jupyter
      - name: Publish to GitHub Pages (and render)
        uses: quarto-dev/quarto-actions/publish@v2
        with:
          target: gh-pages
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -34,7 +34,7 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.10", "3.11"]
timeout-minutes: 10
timeout-minutes: 20

steps:
- name: Check out repository code
3 changes: 3 additions & 0 deletions .gitignore
@@ -2,6 +2,7 @@
configs
last_run_prepared/
.vscode
_site/

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -172,3 +173,5 @@ wandb
lora-out/*
qlora-out/*
mlruns/*

/.quarto/
48 changes: 40 additions & 8 deletions README.md
@@ -32,6 +32,7 @@ Features:
- [Bare Metal Cloud GPU](#bare-metal-cloud-gpu)
- [Windows](#windows)
- [Mac](#mac)
- [Google Colab](#google-colab)
- [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
- [Dataset](#dataset)
- [How to Add Custom Prompts](#how-to-add-custom-prompts)
@@ -149,7 +150,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAcc
```

> [!TIP]
> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.md#debugging-with-docker).
> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.qmd#debugging-with-docker).
<details>

@@ -267,7 +268,11 @@ Use the below instead of the install method in QuickStart.
```
pip3 install -e '.'
```
More info: [mac.md](/docs/mac.md)
More info: [mac.md](/docs/mac.qmd)

#### Google Colab

Please use this example [notebook](examples/colab-notebooks/colab-axolotl-example.ipynb).

#### Launching on public clouds via SkyPilot
To launch on GPU instances (both on-demand and spot instances) on 7+ clouds (GCP, AWS, Azure, OCI, and more), you can use [SkyPilot](https://skypilot.readthedocs.io/en/latest/index.html):
@@ -409,7 +414,7 @@ pretraining_dataset: # hf path only
{"segments": [{"label": true|false, "text": "..."}]}
```

This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction. See [these docs](docs/input_output.md) for more details.
This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction. See [these docs](docs/input_output.qmd) for more details.
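As an illustrative sketch, a dataset entry that uses this template-free format might look like the following (the file name and values are placeholders):

```yaml
# Hypothetical example; output.jsonl is a placeholder path
datasets:
  - path: output.jsonl
    ds_type: json
    type: input_output  # template-free "segments" format described above
```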

##### Conversation

@@ -651,9 +656,13 @@ datasets:
    train_on_split: train # Optional[str] name of dataset split to load from

    # Optional[str] fastchat conversation type, only used with type: sharegpt
    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
    field_human: # Optional[str]. Human key to use for conversation.
    field_model: # Optional[str]. Assistant key to use for conversation.
    # Add additional keys from your dataset as input or output roles
    roles:
      input: # Optional[List[str]]. These will be masked based on train_on_input
      output: # Optional[List[str]].
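As a rough sketch of how the `roles` mapping slots into a ShareGPT-style dataset entry (the dataset path, conversation template, and role names below are placeholders):

```yaml
# Hypothetical example; path, conversation template, and role names are placeholders
datasets:
  - path: your-org/sharegpt-style-data
    type: sharegpt
    conversation: chatml
    field_human: human
    field_model: gpt
    roles:
      input:          # masked according to train_on_inputs
        - system
        - human
      output:
        - gpt
```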

  # Custom user instruction prompt
  - path: repo
@@ -678,6 +687,10 @@ datasets:
# For `completion` datasets only, uses the provided field instead of `text` column
field:

# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
shuffle_merged_datasets: true

# A list of one or more datasets to eval the model with.
# You can use either test_datasets, or val_set_size, but not both.
test_datasets:
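A minimal sketch combining these options; the eval file path and field values are assumptions, not defaults:

```yaml
# Hypothetical example: keep merged dataset order and evaluate on a held-out file
shuffle_merged_datasets: false
test_datasets:
  - path: /workspace/data/eval.jsonl   # placeholder local file
    ds_type: json
    split: train       # local JSON files typically load as a "train" split
    type: completion
```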
@@ -899,7 +912,26 @@ lr_div_factor: # Learning rate div factor
# - paged_adamw_8bit
# - paged_lion_32bit
# - paged_lion_8bit
# - galore_adamw
# - galore_adamw_8bit
# - galore_adafactor
# - galore_adamw_layerwise
# - galore_adamw_8bit_layerwise
# - galore_adafactor_layerwise
optimizer:
# Dictionary of arguments to pass to the optimizer
optim_args:
# For Galore Optimizers the following optim_args are available
# rank: # type: int
# update_proj_gap # type: int
# scale # type: float
# proj_type: # type: str, default = std

# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm
optim_target_modules:
# - self_attn # for llama
# - mlp
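As an illustrative combination of the GaLore-related options listed above (the rank, gap, and scale values are assumptions, not tuned defaults):

```yaml
# Hypothetical GaLore setup; values are illustrative only
optimizer: galore_adamw_8bit
optim_args:
  rank: 128
  update_proj_gap: 200
  scale: 0.25
  proj_type: std
optim_target_modules:
  - self_attn  # e.g. for llama-style models
  - mlp
```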

# Specify weight decay
weight_decay:
# adamw hyperparams
@@ -1098,7 +1130,7 @@ fsdp_config:
##### FSDP + QLoRA
Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.md) for more information.
Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.qmd) for more information.
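As a rough sketch of what such a run can look like (the flags below are a plausible combination rather than the authoritative recipe; see the linked docs):

```yaml
# Hypothetical FSDP + QLoRA sketch; consult docs/fsdp_qlora.qmd for the full guide
adapter: qlora
load_in_4bit: true
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_offload_params: true
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```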
##### Weights & Biases Logging
@@ -1177,7 +1209,7 @@ although this will be very slow, and using the config options above are recommen

## Common Errors 🧰

See also the [FAQ's](./docs/faq.md) and [debugging guide](docs/debugging.md).
See also the [FAQ's](./docs/faq.qmd) and [debugging guide](docs/debugging.qmd).

> If you encounter a 'Cuda out of memory' error, it means your GPU ran out of memory during the training process. Here's how to resolve it:

@@ -1211,7 +1243,7 @@ It's safe to ignore it.

> NCCL Timeouts during training

See the [NCCL](docs/nccl.md) guide.
See the [NCCL](docs/nccl.qmd) guide.


### Tokenization Mismatch b/w Inference & Training
@@ -1229,7 +1261,7 @@ Having misalignment between your prompts during training and inference can cause

## Debugging Axolotl

See [this debugging guide](docs/debugging.md) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.
See [this debugging guide](docs/debugging.qmd) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.

## Need help? 🙋

51 changes: 51 additions & 0 deletions _quarto.yml
@@ -0,0 +1,51 @@
project:
  type: website

website:
  title: "Axolotl"
  description: "Fine-tuning"
  favicon: favicon.jpg
  navbar:
    title: Axolotl
    background: dark
    pinned: false
    collapse: false
    tools:
      - icon: twitter
        href: https://twitter.com/axolotl_ai
      - icon: github
        href: https://github.com/OpenAccess-AI-Collective/axolotl/
      - icon: discord
        href: https://discord.gg/7m9sfhzaf3

  sidebar:
    pinned: true
    collapse-level: 2
    style: docked
    contents:
      - text: Home
        href: index.qmd
      - section: "How-To Guides"
        contents:
          # TODO Edit folder structure after we have more docs.
          - docs/debugging.qmd
          - docs/multipack.qmd
          - docs/fsdp_qlora.qmd
          - docs/input_output.qmd
          - docs/rlhf.qmd
          - docs/nccl.qmd
          - docs/mac.qmd
          - docs/multi-node.qmd
      - section: "Reference"
        contents:
          - docs/config.qmd
          - docs/faq.qmd

format:
  html:
    theme: materia
    css: styles.css
    toc: true
4 changes: 2 additions & 2 deletions cicd/Dockerfile.jinja
@@ -23,9 +23,9 @@ RUN git fetch origin +$GITHUB_REF && \

# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
fi

# So we can test the Docker image
2 changes: 1 addition & 1 deletion devtools/README.md
@@ -1 +1 @@
This directory contains example config files that might be useful for debugging. Please see [docs/debugging.md](../docs/debugging.md) for more information.
This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.
4 changes: 2 additions & 2 deletions docker/Dockerfile
@@ -21,9 +21,9 @@ WORKDIR /workspace/axolotl

# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
fi

# So we can test the Docker image
2 changes: 2 additions & 0 deletions docs/.gitignore
@@ -0,0 +1,2 @@
/.quarto/
_site/
17 changes: 17 additions & 0 deletions docs/config.qmd
@@ -0,0 +1,17 @@
---
title: Config options
description: A complete list of all configuration options.
---

```{python}
#|echo: false
#|output: asis
import re
# Regex pattern to match the YAML block including its code fence
pattern = r'<details[^>]*id="all-yaml-options"[^>]*>.*?<summary>All yaml options.*?```yaml(.*?)```.*?</details>'
with open('../README.md', 'r') as f:
    doc = f.read()
match = re.search(pattern, doc, re.DOTALL)
print("```yaml", match.group(1).strip(), "```", sep="\n")
```
6 changes: 5 additions & 1 deletion docs/debugging.md → docs/debugging.qmd
@@ -1,4 +1,8 @@
# Debugging Axolotl
---
title: Debugging
description: How to debug Axolotl
---


This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes.

18 changes: 0 additions & 18 deletions docs/faq.md

This file was deleted.

21 changes: 21 additions & 0 deletions docs/faq.qmd
@@ -0,0 +1,21 @@
---
title: FAQ
description: Frequently asked questions
---


**Q: The trainer stopped and hasn't progressed in several minutes.**

> A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd).

**Q: Exitcode -9**

> A: This usually happens when you run out of system RAM.

**Q: Exitcode -7 while using deepspeed**

> A: Try upgrading deepspeed with: `pip install -U deepspeed`

**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**

> A: You may be using deepspeed with a single GPU. Please don't set `deepspeed:` in the yaml or CLI.
8 changes: 7 additions & 1 deletion docs/fsdp_qlora.md → docs/fsdp_qlora.qmd
@@ -1,4 +1,10 @@
# FSDP + QLoRA
---
title: FSDP + QLoRA
description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs.
format:
  html:
    toc: true
---

## Background

5 changes: 4 additions & 1 deletion docs/input_output.md → docs/input_output.qmd
@@ -1,4 +1,7 @@
# Template-free prompt construction with the `input_output` format
---
title: Template-free prompt construction
description: "Template-free prompt construction with the `input_output` format"
---

<!-- TOC -->

6 changes: 5 additions & 1 deletion docs/mac.md → docs/mac.qmd
@@ -1,8 +1,12 @@
# Mac M series support
---
title: Mac M-series
description: Mac M-series support
---

Currently, Axolotl on Mac is only partially usable: many of Axolotl's dependencies, including PyTorch, do not support MPS or have incomplete support.

Current support:

- [x] Support for all models
- [x] Full training of models
- [x] LoRA training
5 changes: 4 additions & 1 deletion docs/multi-node.md → docs/multi-node.qmd
@@ -1,4 +1,7 @@
# Multi Node
---
title: Multi Node
description: How to use Axolotl on multiple machines
---

You will need to create a configuration for accelerate, either by running `accelerate config` and following the instructions, or by using one of the presets below:

5 changes: 4 additions & 1 deletion docs/multipack.md → docs/multipack.qmd
@@ -1,4 +1,7 @@
# Multipack (Sample Packing)
---
title: Multipack (Sample Packing)
description: Multipack is a technique to pack multiple sequences into a single batch to increase training throughput.
---
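A minimal sketch of enabling sample packing in an Axolotl config (the values are illustrative):

```yaml
# Illustrative only: turn on multipack (sample packing)
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
```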

## Visualization of Multipack with Flash Attention

5 changes: 4 additions & 1 deletion docs/nccl.md → docs/nccl.qmd
@@ -1,4 +1,7 @@
# NCCL
---
title: NCCL
description: Troubleshooting NCCL issues
---

NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out, causing the training process to abort:
