Reorganize Docs #1468

Merged 5 commits on Apr 1, 2024
631 changes: 5 additions & 626 deletions README.md

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions _quarto.yml
@@ -30,20 +30,20 @@ website:
# TODO Edit folder structure after we have more docs.
- docs/debugging.qmd
- docs/multipack.qmd
-  - docs/fdsp_qlora.qmd
+  - docs/fsdp_qlora.qmd
- docs/input_output.qmd
- docs/rlhf.qmd
- docs/nccl.qmd
- docs/mac.qmd
- docs/multi-node.qmd
- section: "Dataset Formats"
contents: docs/dataset-formats/*
- section: "Reference"
contents:
- docs/config.qmd
- docs/faq.qmd




format:
html:
theme: materia
450 changes: 439 additions & 11 deletions docs/config.qmd

Large diffs are not rendered by default.

71 changes: 71 additions & 0 deletions docs/dataset-formats/conversation.qmd
@@ -0,0 +1,71 @@
---
title: Conversation
description: Conversation format for supervised fine-tuning.
order: 1
---

## Formats

### sharegpt

conversations where `from` is `human`/`gpt` (optional: a first row with the role `system` overrides the default system prompt)

```{.json filename="data.jsonl"}
{"conversations": [{"from": "...", "value": "..."}]}
```
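
Such a file can be produced by dumping one JSON object per line. Below is a minimal sketch; the row contents are hypothetical, not from any real dataset:

```python
import json

# Hypothetical sharegpt-style rows: `from` is "human"/"gpt", with an
# optional leading "system" turn to override the default system prompt.
rows = [
    {"conversations": [
        {"from": "system", "value": "You are a terse assistant."},
        {"from": "human", "value": "What is 2 + 2?"},
        {"from": "gpt", "value": "4"},
    ]},
]

with open("data.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")

# Sanity check: each turn carries exactly the `from` and `value` keys.
with open("data.jsonl") as f:
    for line in f:
        for turn in json.loads(line)["conversations"]:
            assert set(turn) == {"from", "value"}
```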

Note: `type: sharegpt` opens a special `conversation:` config option that enables conversion to many conversation types. See [the config docs](../config.qmd) for all options.

### pygmalion

```{.json filename="data.jsonl"}
{"conversations": [{"role": "...", "value": "..."}]}
```

### sharegpt.load_role

conversations where `role` is used instead of `from`

```{.json filename="data.jsonl"}
{"conversations": [{"role": "...", "value": "..."}]}
```

### sharegpt.load_guanaco

conversations where `from` is `prompter`/`assistant` instead of the default sharegpt roles

```{.json filename="data.jsonl"}
{"conversations": [{"from": "...", "value": "..."}]}
```

### sharegpt_jokes

creates a chat where the bot is asked to tell a joke, then explain why the joke is funny

```{.json filename="data.jsonl"}
{"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
```

## How to add custom prompts for instruction-tuning

For a dataset that is preprocessed for instruction purposes:

```{.json filename="data.jsonl"}
{"input": "...", "output": "..."}
```

You can use this example in your YAML config:

```{.yaml filename="config.yaml"}
datasets:
  - path: repo
    type:
      system_prompt: ""
      field_system: system
      field_instruction: input
      field_output: output
      format: "[INST] {instruction} [/INST]"
      no_input_format: "[INST] {instruction} [/INST]"
```
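
A rough sketch of how the `format` template above is applied to a row follows. The actual prompt assembly happens inside axolotl; this only illustrates the substitution of the `field_instruction` value into `format` (the row contents are hypothetical):

```python
# Hypothetical row from the dataset described above.
row = {"input": "Summarize: the sky is blue.", "output": "The sky is blue."}

# The `{instruction}` placeholder is filled from the column named by
# `field_instruction` (here, `input`); `field_output` supplies the completion.
prompt = "[INST] {instruction} [/INST]".format(instruction=row["input"])
completion = row["output"]

print(prompt)  # [INST] Summarize: the sky is blue. [/INST]
```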

See the full config options [here](../config.qmd).
14 changes: 14 additions & 0 deletions docs/dataset-formats/index.qmd
@@ -0,0 +1,14 @@
---
title: Dataset Formats
description: Supported dataset formats.
listing:
fields: [title, description]
type: table
sort-ui: false
filter-ui: false
max-description-length: 250
---

Axolotl supports a variety of dataset formats. It is recommended to use a JSONL format. The schema of the JSONL depends upon the task and the prompt template you wish to use. Instead of a JSONL, you can also use a HuggingFace dataset with columns for each JSONL field.

Below are these various formats organized by task:
165 changes: 165 additions & 0 deletions docs/dataset-formats/inst_tune.qmd
@@ -0,0 +1,165 @@
---
title: Instruction Tuning
description: Instruction tuning formats for supervised fine-tuning.
order: 2
---

## alpaca

instruction; input (optional)

```{.json filename="data.jsonl"}
{"instruction": "...", "input": "...", "output": "..."}
```
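
One way to produce an alpaca-format file is sketched below; the rows are hypothetical and `input` may be left empty when the instruction needs no extra context:

```python
import json

# Hypothetical alpaca-style rows.
rows = [
    {"instruction": "Translate to French.", "input": "Hello", "output": "Bonjour"},
    {"instruction": "Name a prime number.", "input": "", "output": "2"},
]

# Write one JSON object per line, as expected for a JSONL dataset.
with open("alpaca.jsonl", "w") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
```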

## jeopardy

question and answer

```{.json filename="data.jsonl"}
{"question": "...", "category": "...", "answer": "..."}
```

## oasst

instruction

```{.json filename="data.jsonl"}
{"INSTRUCTION": "...", "RESPONSE": "..."}
```

## gpteacher

instruction; input (optional)

```{.json filename="data.jsonl"}
{"instruction": "...", "input": "...", "response": "..."}
```

## reflection

instruction with reflection; input (optional)

```{.json filename="data.jsonl"}
{"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
```

## explainchoice

question, choices, (solution OR explanation)

```{.json filename="data.jsonl"}
{"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
```

## concisechoice

question, choices, (solution OR explanation)

```{.json filename="data.jsonl"}
{"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
```

## summarizetldr

article and summary

```{.json filename="data.jsonl"}
{"article": "...", "summary": "..."}
```

## alpaca_chat

basic instruct for alpaca chat

```{.json filename="data.jsonl"}
{"instruction": "...", "input": "...", "response": "..."}
```

## alpaca_chat.load_qa

question and answer for alpaca chat

```{.json filename="data.jsonl"}
{"question": "...", "answer": "..."}
```

## alpaca_chat.load_concise

question and answer for alpaca chat, for concise answers

```{.json filename="data.jsonl"}
{"instruction": "...", "input": "...", "response": "..."}
```

## alpaca_chat.load_camel_ai

question and answer for alpaca chat, for load_camel_ai

```{.json filename="data.jsonl"}
{"message_1": "...", "message_2": "..."}
```

## alpaca_w_system.load_open_orca

instruct-style support for Open Orca datasets with included system prompts

```{.json filename="data.jsonl"}
{"system_prompt": "...", "question": "...", "response": "..."}
```

## context_qa

in-context question answering from an article

```{.json filename="data.jsonl"}
{"article": "...", "question": "...", "answer": "..."}
```

## context_qa.load_v2

in-context question answering (alternate)

```{.json filename="data.jsonl"}
{"context": "...", "question": "...", "answer": "..."}
```

## context_qa.load_404

in-context question answering from an article, with a default response when the context contains no answer

```{.json filename="data.jsonl"}
{"article": "...", "unanswerable_question": "..."}
```

## creative_acr.load_answer

instruction and revision

```{.json filename="data.jsonl"}
{"instruction": "...", "revision": "..."}
```

## creative_acr.load_critique

critique

```{.json filename="data.jsonl"}
{"scores": "...", "critiques": "...", "instruction": "...", "answer": "..."}
```

## creative_acr.load_revise

critique and revise

```{.json filename="data.jsonl"}
{"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
```

## metharme

instruction, adds additional eos tokens

```{.json filename="data.jsonl"}
{"prompt": "...", "generation": "..."}
```
26 changes: 26 additions & 0 deletions docs/dataset-formats/pretraining.qmd
@@ -0,0 +1,26 @@
---
title: Pre-training
description: Data format for a pre-training completion task.
order: 3
---

For pretraining, there is no prompt template or roles. The only required field is `text`:

```{.json filename="data.jsonl"}
{"text": "first row"}
{"text": "second row"}
...
```
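
This file can be produced by dumping each raw document as a `text`-only JSON object, one per line. A minimal sketch with hypothetical documents:

```python
import json

# Hypothetical raw documents for the pre-training completion task.
documents = ["first row", "second row"]

# Each line is a JSON object with the single required `text` field.
with open("pretrain.jsonl", "w") as f:
    for text in documents:
        f.write(json.dumps({"text": text}) + "\n")
```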

:::{.callout-note}

### Streaming is recommended for large datasets

Axolotl usually loads the entire dataset into memory, which can be challenging for large datasets. Use the following config to enable streaming:

```{.yaml filename="config.yaml"}
pretraining_dataset: # hf path only
...
```

:::
7 changes: 7 additions & 0 deletions docs/dataset-formats/template_free.qmd
@@ -0,0 +1,7 @@
---
title: Template-Free
description: Construct prompts without a template.
order: 4
---

See [these docs](../input_output.qmd).
12 changes: 12 additions & 0 deletions docs/dataset-formats/tokenized.qmd
@@ -0,0 +1,12 @@
---
title: Custom Pre-Tokenized Dataset
description: How to use a custom pre-tokenized dataset.
order: 5
---

- Do not pass a `type:` in your axolotl config.
- The dataset's columns must be exactly `input_ids`, `attention_mask`, and `labels`.

```{.yaml filename="config.yml"}
- path: ...
```
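
The shape of a single pre-tokenized row is sketched below. The token IDs are hypothetical; `-100` is the value conventionally used to mask a position out of the loss:

```python
# Hypothetical pre-tokenized row; the three column names must match exactly.
row = {
    "input_ids": [1, 15043, 2],
    "attention_mask": [1, 1, 1],
    "labels": [-100, 15043, 2],  # -100 masks the first position from the loss
}

# The three columns must be present and aligned token-for-token.
assert set(row) == {"input_ids", "attention_mask", "labels"}
assert len(row["input_ids"]) == len(row["attention_mask"]) == len(row["labels"])
```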
2 changes: 1 addition & 1 deletion docs/fsdp_qlora.qmd
@@ -1,5 +1,5 @@
---
-title: FDSP + QLoRA
+title: "FDSP + QLoRA"
description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs.
format:
html:
10 changes: 6 additions & 4 deletions docs/input_output.qmd
@@ -91,8 +91,9 @@ format into a jsonl file (below is the first row from the file

```bash
$ head -n1 output.jsonl | python -m json.tool
+```

-{.cell-output .cell-output-stdout}
+:::{.cell-output .cell-output-stdout}
{
"segments": [
{
@@ -113,7 +114,7 @@ $ head -n1 output.jsonl | python -m json.tool
}
]
}
-```
+:::

Set `label:false` when you want to mask a segment of text so that the
model isn't trained on it. Some things to keep in mind:
@@ -238,8 +239,9 @@ version is repeated below for reference):

```bash
$ head -n1 output.jsonl | python -m json.tool
+```

-{.cell-output .cell-output-stdout}
+:::{.cell-output .cell-output-stdout}
{
"segments": [
{
@@ -260,4 +262,4 @@ $ head -n1 output.jsonl | python -m json.tool
}
]
}
-```
+:::