diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000000..16cf774d71 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,31 @@ +name: Publish Docs +on: + push: + branches: + - main + +permissions: + contents: write + pages: write + +jobs: + build-deploy: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + - name: Set up Quarto + uses: quarto-dev/quarto-actions/setup@v2 + - name: Setup Python + uses: actions/setup-python@v3 + with: + python-version: '3.10' + - name: install dependencies + run: | + python3 -m pip install jupyter + - name: Publish to GitHub Pages (and render) + uses: quarto-dev/quarto-actions/publish@v2 + with: + target: gh-pages + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 684326064e..d8ca5400a9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -34,7 +34,7 @@ jobs: fail-fast: false matrix: python_version: ["3.10", "3.11"] - timeout-minutes: 10 + timeout-minutes: 20 steps: - name: Check out repository code diff --git a/.gitignore b/.gitignore index 9d6a103dab..589440abf6 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ configs last_run_prepared/ .vscode +_site/ # Byte-compiled / optimized / DLL files __pycache__/ @@ -172,3 +173,5 @@ wandb lora-out/* qlora-out/* mlruns/* + +/.quarto/ diff --git a/README.md b/README.md index 1629ae251d..4cd6fbda47 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ Features: - [Bare Metal Cloud GPU](#bare-metal-cloud-gpu) - [Windows](#windows) - [Mac](#mac) + - [Google Colab](#google-colab) - [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot) - [Dataset](#dataset) - [How to Add Custom Prompts](#how-to-add-custom-prompts) @@ -149,7 +150,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAcc ``` >[!Tip] -> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.md#debugging-with-docker). +> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.qmd#debugging-with-docker).
@@ -267,7 +268,11 @@ Use the below instead of the install method in QuickStart. ``` pip3 install -e '.' ``` -More info: [mac.md](/docs/mac.md) +More info: [mac.md](/docs/mac.qmd) + +#### Google Colab + +Please use this example [notebook](examples/colab-notebooks/colab-axolotl-example.ipynb). #### Launching on public clouds via SkyPilot To launch on GPU instances (both on-demand and spot instances) on 7+ clouds (GCP, AWS, Azure, OCI, and more), you can use [SkyPilot](https://skypilot.readthedocs.io/en/latest/index.html): @@ -409,7 +414,7 @@ pretraining_dataset: # hf path only {"segments": [{"label": true|false, "text": "..."}]} ``` -This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction. See [these docs](docs/input_output.md) for more details. +This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction. See [these docs](docs/input_output.qmd) for more details. ##### Conversation @@ -651,9 +656,13 @@ datasets: train_on_split: train # Optional[str] name of dataset split to load from # Optional[str] fastchat conversation type, only used with type: sharegpt - conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py + conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py field_human: # Optional[str]. Human key to use for conversation. field_model: # Optional[str]. Assistant key to use for conversation. + # Add additional keys from your dataset as input or output roles + roles: + input: # Optional[List[str]]. These will be masked based on train_on_input + output: # Optional[List[str]]. # Custom user instruction prompt - path: repo @@ -678,6 +687,10 @@ datasets: # For `completion` datasets only, uses the provided field instead of `text` column field: +# If false, the datasets will not be shuffled and will keep their original order in `datasets`. +# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true. +shuffle_merged_datasets: true + # A list of one or more datasets to eval the model with. # You can use either test_datasets, or val_set_size, but not both. test_datasets: @@ -899,7 +912,26 @@ lr_div_factor: # Learning rate div factor # - paged_adamw_8bit # - paged_lion_32bit # - paged_lion_8bit +# - galore_adamw +# - galore_adamw_8bit +# - galore_adafactor +# - galore_adamw_layerwise +# - galore_adamw_8bit_layerwise +# - galore_adafactor_layerwise optimizer: +# Dictionary of arguments to pass to the optimizer +optim_args: +# For GaLore optimizers, the following optim_args are available: +# rank: # type: int +# update_proj_gap: # type: int +# scale: # type: float +# proj_type: # type: str, default = std + +# The target modules to optimize, i.e. the module names that you would like to train. Currently this is only used for the GaLore algorithm. +optim_target_modules: +# - self_attn # for llama +# - mlp + # Specify weight decay weight_decay: # adamw hyperparams @@ -1098,7 +1130,7 @@ fsdp_config: ##### FSDP + QLoRA -Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.md) for more information. +Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.qmd) for more information.
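As a minimal illustrative sketch of the new options documented in the hunks above (not part of this diff: the model, dataset path, role names, and GaLore hyperparameter values below are placeholders), the `roles` mapping, `shuffle_merged_datasets`, and the GaLore optimizer settings could be combined in a config like this:

```yaml
# Hypothetical config combining the options added above; all values are illustrative.
base_model: NousResearch/Llama-2-7b-hf   # placeholder model

datasets:
  - path: my-org/my-sharegpt-dataset     # placeholder dataset
    type: sharegpt
    conversation: chatml
    # extra role keys present in the dataset, mapped to the input or output side;
    # input roles are masked based on train_on_input
    roles:
      input: ["system", "human"]
      output: ["gpt"]

# keep the default shuffling behaviour explicit
shuffle_merged_datasets: true

# GaLore: pick one of the galore_* optimizers and pass its hyperparameters
# via optim_args; optim_target_modules selects the modules GaLore is applied to
optimizer: galore_adamw
optim_args:
  rank: 256
  update_proj_gap: 200
  scale: 0.25
  proj_type: std
optim_target_modules:
  - self_attn   # for llama
  - mlp
```

Running a GaLore config also assumes the `galore` extra is installed, which the Dockerfile changes later in this diff add to the default image builds.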
##### Weights & Biases Logging @@ -1177,7 +1209,7 @@ although this will be very slow, and using the config options above is recommended. ## Common Errors 🧰 -See also the [FAQ's](./docs/faq.md) and [debugging guide](docs/debugging.md). +See also the [FAQ's](./docs/faq.qmd) and [debugging guide](docs/debugging.qmd). > If you encounter a 'Cuda out of memory' error, it means your GPU ran out of memory during the training process. Here's how to resolve it: @@ -1211,7 +1243,7 @@ It's safe to ignore it. > NCCL Timeouts during training -See the [NCCL](docs/nccl.md) guide. +See the [NCCL](docs/nccl.qmd) guide. ### Tokenization Mismatch b/w Inference & Training @@ -1229,7 +1261,7 @@ Having misalignment between your prompts during training and inference can cause ## Debugging Axolotl -See [this debugging guide](docs/debugging.md) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode. +See [this debugging guide](docs/debugging.qmd) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode. ## Need help? πŸ™‹ diff --git a/_quarto.yml b/_quarto.yml new file mode 100644 index 0000000000..31aa90398e --- /dev/null +++ b/_quarto.yml @@ -0,0 +1,51 @@ +project: + type: website + +website: + title: "Axolotl" + description: "Fine-tuning" + favicon: favicon.jpg + navbar: + title: Axolotl + background: dark + pinned: false + collapse: false + tools: + - icon: twitter + href: https://twitter.com/axolotl_ai + - icon: github + href: https://github.com/OpenAccess-AI-Collective/axolotl/ + - icon: discord + href: https://discord.gg/7m9sfhzaf3 + + sidebar: + pinned: true + collapse-level: 2 + style: docked + contents: + - text: Home + href: index.qmd + - section: "How-To Guides" + contents: + # TODO Edit folder structure after we have more docs. + - docs/debugging.qmd + - docs/multipack.qmd + - docs/fsdp_qlora.qmd + - docs/input_output.qmd + - docs/rlhf.qmd + - docs/nccl.qmd + - docs/mac.qmd + - docs/multi-node.qmd + - section: "Reference" + contents: + - docs/config.qmd + - docs/faq.qmd + + + + +format: + html: + theme: materia + css: styles.css + toc: true diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja index a4784707c5..19d7a09de5 100644 --- a/cicd/Dockerfile.jinja +++ b/cicd/Dockerfile.jinja @@ -23,9 +23,9 @@ RUN git fetch origin +$GITHUB_REF && \ # If AXOLOTL_EXTRAS is set, append it in brackets RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ - pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ + pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ else \ - pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \ + pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \ fi # So we can test the Docker image diff --git a/devtools/README.md b/devtools/README.md index 1d727ed8bb..0114ee3a80 100644 --- a/devtools/README.md +++ b/devtools/README.md @@ -1 +1 @@ -This directory contains example config files that might be useful for debugging. Please see [docs/debugging.md](../docs/debugging.md) for more information. +This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.
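For orientation, each guide registered in the `_quarto.yml` sidebar above follows the same pattern as the doc renames later in this diff: a `.qmd` file whose top-level Markdown heading is replaced by Quarto front matter, plus one sidebar entry. A minimal sketch for a hypothetical new page (the filename and wording are illustrative, not part of this change):

```yaml
# Front matter at the top of a hypothetical docs/my_new_guide.qmd
---
title: My New Guide
description: One-line summary shown in the sidebar and page header.
---

# ...and the matching entry under the "How-To Guides" section of _quarto.yml:
# contents:
#   - docs/my_new_guide.qmd
```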
diff --git a/docker/Dockerfile b/docker/Dockerfile index 62904af722..84046c6522 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -21,9 +21,9 @@ WORKDIR /workspace/axolotl # If AXOLOTL_EXTRAS is set, append it in brackets RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ - pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ + pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ else \ - pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \ + pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \ fi # So we can test the Docker image diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000000..4c23a061fa --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,2 @@ +/.quarto/ +_site/ diff --git a/docs/config.qmd b/docs/config.qmd new file mode 100644 index 0000000000..d93b170e7b --- /dev/null +++ b/docs/config.qmd @@ -0,0 +1,17 @@ +--- +title: Config options +description: A complete list of all configuration options. +--- + +```{python} +#|echo: false +#|output: asis +import re +# Regex pattern to match the YAML block including its code fence +pattern = r'<details[^>]*id="all-yaml-options"[^>]*>.*?All yaml options.*?```yaml(.*?)```.*?</details>
' + +with open('../README.md', 'r') as f: + doc = f.read() +match = re.search(pattern, doc, re.DOTALL) +print("```yaml", match.group(1).strip(), "```", sep="\n") +``` diff --git a/docs/debugging.md b/docs/debugging.qmd similarity index 99% rename from docs/debugging.md rename to docs/debugging.qmd index 59df0b785b..7237fbd6f2 100644 --- a/docs/debugging.md +++ b/docs/debugging.qmd @@ -1,4 +1,8 @@ -# Debugging Axolotl +--- +title: Debugging +description: How to debug Axolotl +--- + This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes. diff --git a/docs/faq.md b/docs/faq.md deleted file mode 100644 index 6542306538..0000000000 --- a/docs/faq.md +++ /dev/null @@ -1,18 +0,0 @@ -# Axolotl FAQ's - - -> The trainer stopped and hasn't progressed in several minutes. - -Usually an issue with the GPU's communicating with each other. See the [NCCL doc](../docs/nccl.md) - -> Exitcode -9 - -This usually happens when you run out of system RAM. - -> Exitcode -7 while using deepspeed - -Try upgrading deepspeed w: `pip install -U deepspeed` - -> AttributeError: 'DummyOptim' object has no attribute 'step' - -You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli. diff --git a/docs/faq.qmd b/docs/faq.qmd new file mode 100644 index 0000000000..91413d24e9 --- /dev/null +++ b/docs/faq.qmd @@ -0,0 +1,21 @@ +--- +title: FAQ +description: Frequently asked questions +--- + + +**Q: The trainer stopped and hasn't progressed in several minutes.** + +> A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd). + +**Q: Exitcode -9** + +> A: This usually happens when you run out of system RAM. + +**Q: Exitcode -7 while using deepspeed** + +> A: Try upgrading deepspeed with `pip install -U deepspeed`. + +**Q: AttributeError: 'DummyOptim' object has no attribute 'step'** + +> A: You may be using deepspeed with a single GPU. Please don't set `deepspeed:` in the yaml or cli. diff --git a/docs/fsdp_qlora.md b/docs/fsdp_qlora.qmd similarity index 92% rename from docs/fsdp_qlora.md rename to docs/fsdp_qlora.qmd index 14b2c1a571..69b4ad4454 100644 --- a/docs/fsdp_qlora.md +++ b/docs/fsdp_qlora.qmd @@ -1,4 +1,10 @@ -# FDSP + QLoRA +--- +title: FSDP + QLoRA +description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs. +format: + html: + toc: true +--- ## Background diff --git a/docs/input_output.md b/docs/input_output.qmd similarity index 98% rename from docs/input_output.md rename to docs/input_output.qmd index dbc6979c6f..4e2ea1345f 100644 --- a/docs/input_output.md +++ b/docs/input_output.qmd @@ -1,4 +1,7 @@ -# Template-free prompt construction with the `input_output` format +--- +title: Template-free prompt construction +description: "Template-free prompt construction with the `input_output` format" +--- diff --git a/docs/mac.md b/docs/mac.qmd similarity index 89% rename from docs/mac.md rename to docs/mac.qmd index 59eacce6d0..2a83035381 100644 --- a/docs/mac.md +++ b/docs/mac.qmd @@ -1,8 +1,12 @@ -# Mac M series support +--- +title: Mac M-series +description: Mac M-series support +--- Currently Axolotl on Mac is partially usable; many of the dependencies of Axolotl, including PyTorch, do not support MPS or have incomplete support.
Current support: + - [x] Support for all models - [x] Full training of models - [x] LoRA training diff --git a/docs/multi-node.md b/docs/multi-node.qmd similarity index 95% rename from docs/multi-node.md rename to docs/multi-node.qmd index 6806159690..5c6fa976b9 100644 --- a/docs/multi-node.md +++ b/docs/multi-node.qmd @@ -1,4 +1,7 @@ -# Multi Node +--- +title: Multi Node +description: How to use Axolotl on multiple machines +--- You will need to create a configuration for accelerate, either by using `accelerate config` and follow the instructions or you can use one of the preset below: diff --git a/docs/multipack.md b/docs/multipack.qmd similarity index 92% rename from docs/multipack.md rename to docs/multipack.qmd index bee13b62c3..097bcd2e50 100644 --- a/docs/multipack.md +++ b/docs/multipack.qmd @@ -1,4 +1,7 @@ -# Multipack (Sample Packing) +--- +title: Multipack (Sample Packing) +description: Multipack is a technique to pack multiple sequences into a single batch to increase training throughput. +--- ## Visualization of Multipack with Flash Attention diff --git a/docs/nccl.md b/docs/nccl.qmd similarity index 98% rename from docs/nccl.md rename to docs/nccl.qmd index 4a7ff5d5d6..3b616aa665 100644 --- a/docs/nccl.md +++ b/docs/nccl.qmd @@ -1,4 +1,7 @@ -# NCCL +--- +title: NCCL +description: Troubleshooting NCCL issues +--- NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort: diff --git a/docs/rlhf.md b/docs/rlhf.qmd similarity index 90% rename from docs/rlhf.md rename to docs/rlhf.qmd index 4f71184fc0..7db68915ad 100644 --- a/docs/rlhf.md +++ b/docs/rlhf.qmd @@ -1,4 +1,7 @@ -# RLHF (Beta) +--- +title: "RLHF (Beta)" +description: "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback." +--- ### Overview diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml index da6c06020f..30916ed45a 100644 --- a/examples/llama-2/qlora-fsdp.yml +++ b/examples/llama-2/qlora-fsdp.yml @@ -36,7 +36,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 4 num_epochs: 4 -optimizer: paged_adamw_8bit +optimizer: adamw_torch lr_scheduler: cosine learning_rate: 0.00001 @@ -66,5 +66,11 @@ weight_decay: 0.0 fsdp: - full_shard fsdp_config: + fsdp_limit_all_gathers: true + fsdp_sync_module_states: true + fsdp_offload_params: true + fsdp_use_orig_params: false + fsdp_cpu_ram_efficient_loading: true fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer + fsdp_state_dict_type: SHARDED_STATE_DICT special_tokens: diff --git a/examples/mistral/Mistral-7b-example/README.md b/examples/mistral/Mistral-7b-example/README.md deleted file mode 100644 index 2d5ac87a17..0000000000 --- a/examples/mistral/Mistral-7b-example/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# Description -This repository presents an in-depth guide for fine-tuning Mistral-7b or any other compatible model using Axolotl, tailored specifically for chatbot development. It streamlines the process of fine-tuning and uploading the enhanced model to HuggingFace πŸ€—, thereby serving as an invaluable tool for developers in the AI and chatbot domain. 
- -**What’s Inside:** - -Beginner-Friendly Instructions: Comprehensive steps to guide you through fine-tuning your chosen model, including details on the data structure (jsonl), configuration, and the code itself. - -Hardware Utilized: For reference, the fine-tuning in this guide was performed using 4x NVIDIA GeForce RTX 3090 (rented 2.1.2-cuda12.1-cudnn8-devel). - -**Uploading to HuggingFace πŸ€—:** -To upload your fine-tuned model to Hugging Face, include the following files: -![Screenshot 2024-01-19 213932](https://github.com/OpenAccess-AI-Collective/axolotl/assets/138583191/d660eb84-2d76-46a1-9846-cf0aeb3006d9) diff --git a/examples/mistral/Mistral-7b-example/code.ipynb b/examples/mistral/Mistral-7b-example/code.ipynb deleted file mode 100644 index 7e84d8124a..0000000000 --- a/examples/mistral/Mistral-7b-example/code.ipynb +++ /dev/null @@ -1,970 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "3fe31229-8f6b-48bc-a86d-af8e5466d11c", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "GPU available? True\n", - "BF16 is supported? True\n" - ] - } - ], - "source": [ - "# Check if GPU is available I used 4x NVIDIA GeForce RTX 3090 (rented 2.1.2-cuda12.1-cudnn8-devel)\n", - "import torch\n", - "print('GPU available?', torch.cuda.is_available())\n", - "print('BF16 is supported?', torch.cuda.is_bf16_supported())" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "1dee845b-f3cb-4b1e-bdd9-1a918eac140b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting huggingface_hub\n", - " Downloading huggingface_hub-0.20.1-py3-none-any.whl.metadata (12 kB)\n", - "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.9.0)\n", - "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.10.0)\n", - "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.31.0)\n", - "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.65.0)\n", - "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.7.1)\n", - "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.7.22)\n", - "Downloading huggingface_hub-0.20.1-py3-none-any.whl (330 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m330.1/330.1 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n", - "\u001b[?25hInstalling collected packages: huggingface_hub\n", - "Successfully installed 
huggingface_hub-0.20.1\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!pip install huggingface_hub" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "88731672-9050-4034-8266-11aaace2a44e", - "metadata": {}, - "outputs": [], - "source": [ - "from huggingface_hub import notebook_login" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "6b5aa7d7-3b18-4c14-afd4-043c2c545259", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "60df98d7b0294289aad8b6c8cd023c3b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(HTML(value='
=0.41.1\n", - " Downloading bitsandbytes-0.41.3.post2-py3-none-any.whl.metadata (9.8 kB)\n", - "Collecting accelerate==0.24.1\n", - " Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)\n", - "Collecting addict\n", - " Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n", - "Collecting fire\n", - " Downloading fire-0.5.0.tar.gz (88 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.3/88.3 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n", - "\u001b[?25hRequirement already satisfied: PyYAML>=6.0 in /opt/conda/lib/python3.10/site-packages (6.0.1)\n", - "Collecting datasets>=2.15.0\n", - " Downloading datasets-2.16.0-py3-none-any.whl.metadata (20 kB)\n", - "Collecting sentencepiece\n", - " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m47.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting wandb\n", - " Downloading wandb-0.16.1-py3-none-any.whl.metadata (9.8 kB)\n", - "Collecting einops\n", - " Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)\n", - "Collecting optimum==1.13.2\n", - " Downloading optimum-1.13.2.tar.gz (300 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.0/301.0 kB\u001b[0m \u001b[31m72.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Installing build dependencies ... \u001b[?25ldone\n", - "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n", - "\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n", - "\u001b[?25hCollecting hf_transfer\n", - " Downloading hf_transfer-0.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)\n", - "Collecting colorama\n", - " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", - "Collecting numba\n", - " Downloading numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)\n", - "Requirement already satisfied: numpy>=1.24.4 in /opt/conda/lib/python3.10/site-packages (1.26.0)\n", - "Collecting bert-score==0.3.13\n", - " Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.1/61.1 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting evaluate==0.4.0\n", - " Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.4/81.4 kB\u001b[0m \u001b[31m26.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting rouge-score==0.1.2\n", - " Downloading rouge_score-0.1.2.tar.gz (17 kB)\n", - " Preparing metadata (setup.py) ... 
\u001b[?25ldone\n", - "\u001b[?25hCollecting scipy\n", - " Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.4/60.4 kB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting scikit-learn==1.2.2\n", - " Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.6/9.6 MB\u001b[0m \u001b[31m83.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n", - "\u001b[?25hCollecting pynvml\n", - " Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting art\n", - " Downloading art-6.1-py3-none-any.whl.metadata (69 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.9/69.9 kB\u001b[0m \u001b[31m21.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting fschat==0.2.34\n", - " Downloading fschat-0.2.34-py3-none-any.whl.metadata (20 kB)\n", - "Collecting gradio==3.50.2\n", - " Downloading gradio-3.50.2-py3-none-any.whl.metadata (17 kB)\n", - "Collecting tensorboard\n", - " Downloading tensorboard-2.15.1-py3-none-any.whl.metadata (1.7 kB)\n", - "Collecting s3fs\n", - " Downloading s3fs-2023.12.2-py3-none-any.whl.metadata (1.6 kB)\n", - "Collecting gcsfs\n", - " Downloading gcsfs-2023.12.2.post1-py2.py3-none-any.whl.metadata (1.6 kB)\n", - "Collecting xformers==0.0.23\n", - " Downloading xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)\n", - "Collecting deepspeed\n", - " Downloading deepspeed-0.12.6.tar.gz (1.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m109.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n", - "\u001b[?25hCollecting flash-attn==2.3.3\n", - " Downloading flash_attn-2.3.3.tar.gz (2.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m111.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25ldone\n", - "\u001b[?25hRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (5.9.0)\n", - "Requirement already satisfied: torch>=1.10.0 in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (2.1.1)\n", - "Requirement already satisfied: huggingface-hub in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (0.20.1)\n", - "Collecting rouge (from auto-gptq==0.5.1)\n", - " Downloading rouge-1.0.1-py3-none-any.whl (13 kB)\n", - "Collecting gekko (from auto-gptq==0.5.1)\n", - " Downloading gekko-1.0.6-py3-none-any.whl (12.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m77.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n", - "\u001b[?25hCollecting safetensors (from auto-gptq==0.5.1)\n", - " Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n", - "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from auto-gptq==0.5.1) (4.65.0)\n", - "Collecting pandas>=1.0.1 (from bert-score==0.3.13)\n", - " Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n", - "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from bert-score==0.3.13) (2.31.0)\n", - "Collecting matplotlib (from bert-score==0.3.13)\n", - " Downloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)\n", - "Collecting dill (from evaluate==0.4.0)\n", - " Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)\n", - "Collecting xxhash (from evaluate==0.4.0)\n", - " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", - "Collecting multiprocess (from evaluate==0.4.0)\n", - " Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)\n", - "Requirement already satisfied: fsspec>=2021.05.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]>=2021.05.0->evaluate==0.4.0) (2023.10.0)\n", - "Collecting responses<0.19 (from evaluate==0.4.0)\n", - " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", - "Collecting ninja (from flash-attn==2.3.3)\n", - " Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)\n", - "Collecting aiohttp (from fschat==0.2.34)\n", - " Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)\n", - "Collecting fastapi (from fschat==0.2.34)\n", - " Downloading fastapi-0.108.0-py3-none-any.whl.metadata (24 kB)\n", - "Collecting httpx (from fschat==0.2.34)\n", - " Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)\n", - "Collecting markdown2[all] (from fschat==0.2.34)\n", - " Downloading markdown2-2.4.12-py2.py3-none-any.whl.metadata (2.0 kB)\n", - "Collecting nh3 (from fschat==0.2.34)\n", - " Downloading nh3-0.2.15-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)\n", - "Requirement already satisfied: prompt-toolkit>=3.0.0 in /opt/conda/lib/python3.10/site-packages (from fschat==0.2.34) (3.0.36)\n", - "Collecting pydantic<2,>=1 (from fschat==0.2.34)\n", - " Downloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (149 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.6/149.6 kB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting rich>=10.0.0 (from fschat==0.2.34)\n", - " Downloading rich-13.7.0-py3-none-any.whl.metadata (18 kB)\n", - "Collecting shortuuid (from fschat==0.2.34)\n", - " Downloading shortuuid-1.0.11-py3-none-any.whl (10 kB)\n", - "Collecting tiktoken (from fschat==0.2.34)\n", - " Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", - "Collecting uvicorn (from fschat==0.2.34)\n", - " Downloading uvicorn-0.25.0-py3-none-any.whl.metadata (6.4 kB)\n", - "Collecting aiofiles<24.0,>=22.0 (from gradio==3.50.2)\n", - " Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)\n", - "Collecting altair<6.0,>=4.2.0 (from gradio==3.50.2)\n", - " Downloading altair-5.2.0-py3-none-any.whl.metadata (8.7 kB)\n", - "Collecting ffmpy (from gradio==3.50.2)\n", - " Downloading ffmpy-0.3.1.tar.gz (5.5 kB)\n", - " Preparing metadata (setup.py) ... \u001b[?25ldone\n", - "\u001b[?25hCollecting gradio-client==0.6.1 (from gradio==3.50.2)\n", - " Downloading gradio_client-0.6.1-py3-none-any.whl.metadata (7.1 kB)\n", - "Collecting importlib-resources<7.0,>=1.3 (from gradio==3.50.2)\n", - " Downloading importlib_resources-6.1.1-py3-none-any.whl.metadata (4.1 kB)\n", - "Requirement already satisfied: jinja2<4.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (3.1.2)\n", - "Requirement already satisfied: markupsafe~=2.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (2.1.1)\n", - "Collecting orjson~=3.0 (from gradio==3.50.2)\n", - " Downloading orjson-3.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (49 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: pillow<11.0,>=8.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (10.0.1)\n", - "Collecting pydub (from gradio==3.50.2)\n", - " Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n", - "Collecting python-multipart (from gradio==3.50.2)\n", - " Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting semantic-version~=2.0 (from gradio==3.50.2)\n", - " Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n", - "Requirement already satisfied: typing-extensions~=4.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (4.7.1)\n", - "Collecting websockets<12.0,>=10.0 (from gradio==3.50.2)\n", - " Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m30.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting coloredlogs (from optimum==1.13.2)\n", - " Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from optimum==1.13.2) (1.11.1)\n", - "Collecting absl-py (from rouge-score==0.1.2)\n", - " Downloading 
absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)\n", - "Collecting nltk (from rouge-score==0.1.2)\n", - " Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m90.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: six>=1.14.0 in /opt/conda/lib/python3.10/site-packages (from rouge-score==0.1.2) (1.16.0)\n", - "Collecting joblib>=1.1.1 (from scikit-learn==1.2.2)\n", - " Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)\n", - "Collecting threadpoolctl>=2.0.0 (from scikit-learn==1.2.2)\n", - " Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)\n", - "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from transformers==4.36.2) (3.9.0)\n", - "Collecting regex!=2019.12.17 (from transformers==4.36.2)\n", - " Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate==0.24.1) (3.1)\n", - "Collecting pyarrow>=8.0.0 (from datasets>=2.15.0)\n", - " Downloading pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)\n", - "Collecting pyarrow-hotfix (from datasets>=2.15.0)\n", - " Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)\n", - "Collecting hjson (from deepspeed)\n", - " Downloading hjson-3.1.0-py3-none-any.whl (54 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.0/54.0 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting py-cpuinfo (from deepspeed)\n", - " Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\n", - "Collecting termcolor (from fire)\n", - " Downloading termcolor-2.4.0-py3-none-any.whl.metadata (6.1 kB)\n", - "Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.10/site-packages (from gcsfs) (5.1.1)\n", - "INFO: pip is looking at multiple versions of gcsfs to determine which version is compatible with other requirements. This could take a while.\n", - "Collecting gcsfs\n", - " Downloading gcsfs-2023.12.1-py2.py3-none-any.whl.metadata (1.6 kB)\n", - " Downloading gcsfs-2023.12.0-py2.py3-none-any.whl.metadata (1.6 kB)\n", - " Downloading gcsfs-2023.10.0-py2.py3-none-any.whl.metadata (1.6 kB)\n", - "Collecting google-auth>=1.2 (from gcsfs)\n", - " Downloading google_auth-2.25.2-py2.py3-none-any.whl.metadata (4.7 kB)\n", - "Collecting google-auth-oauthlib (from gcsfs)\n", - " Downloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl.metadata (2.7 kB)\n", - "Collecting google-cloud-storage (from gcsfs)\n", - " Downloading google_cloud_storage-2.14.0-py2.py3-none-any.whl.metadata (6.1 kB)\n", - "Collecting llvmlite<0.42,>=0.41.0dev0 (from numba)\n", - " Downloading llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)\n", - "Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)\n", - " Downloading aiobotocore-2.9.0-py3-none-any.whl.metadata (20 kB)\n", - "INFO: pip is looking at multiple versions of s3fs to determine which version is compatible with other requirements. 
This could take a while.\n", - "Collecting s3fs\n", - " Downloading s3fs-2023.12.1-py3-none-any.whl.metadata (1.6 kB)\n", - " Downloading s3fs-2023.10.0-py3-none-any.whl.metadata (1.6 kB)\n", - "Collecting aiobotocore~=2.7.0 (from s3fs)\n", - " Downloading aiobotocore-2.7.0-py3-none-any.whl.metadata (20 kB)\n", - "Collecting grpcio>=1.48.2 (from tensorboard)\n", - " Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)\n", - "Collecting markdown>=2.6.8 (from tensorboard)\n", - " Downloading Markdown-3.5.1-py3-none-any.whl.metadata (7.1 kB)\n", - "Collecting protobuf<4.24,>=3.19.6 (from tensorboard)\n", - " Downloading protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl.metadata (540 bytes)\n", - "Requirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard) (68.0.0)\n", - "Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)\n", - " Downloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)\n", - "Collecting werkzeug>=1.0.1 (from tensorboard)\n", - " Downloading werkzeug-3.0.1-py3-none-any.whl.metadata (4.1 kB)\n", - "Requirement already satisfied: Click!=8.0.0,>=7.1 in /opt/conda/lib/python3.10/site-packages (from wandb) (8.1.7)\n", - "Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)\n", - " Downloading GitPython-3.1.40-py3-none-any.whl.metadata (12 kB)\n", - "Collecting sentry-sdk>=1.0.0 (from wandb)\n", - " Downloading sentry_sdk-1.39.1-py2.py3-none-any.whl.metadata (9.7 kB)\n", - "Collecting docker-pycreds>=0.4.0 (from wandb)\n", - " Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n", - "Collecting setproctitle (from wandb)\n", - " Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)\n", - "Collecting appdirs>=1.4.3 (from wandb)\n", - " Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\n", - "Collecting botocore<1.31.65,>=1.31.16 (from aiobotocore~=2.7.0->s3fs)\n", - " Downloading botocore-1.31.64-py3-none-any.whl.metadata (6.1 kB)\n", - "Collecting wrapt<2.0.0,>=1.10.10 (from aiobotocore~=2.7.0->s3fs)\n", - " Downloading wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", - "Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore~=2.7.0->s3fs)\n", - " Downloading aioitertools-0.11.0-py3-none-any.whl (23 kB)\n", - "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->fschat==0.2.34) (23.1.0)\n", - "Collecting multidict<7.0,>=4.5 (from aiohttp->fschat==0.2.34)\n", - " Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.5/114.5 kB\u001b[0m \u001b[31m37.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting yarl<2.0,>=1.0 (from aiohttp->fschat==0.2.34)\n", - " Downloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)\n", - "Collecting frozenlist>=1.1.1 (from aiohttp->fschat==0.2.34)\n", - " Downloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", - "Collecting aiosignal>=1.1.2 (from aiohttp->fschat==0.2.34)\n", - " Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n", - "Collecting async-timeout<5.0,>=4.0 (from 
aiohttp->fschat==0.2.34)\n", - " Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)\n", - "Requirement already satisfied: jsonschema>=3.0 in /opt/conda/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio==3.50.2) (4.20.0)\n", - "Requirement already satisfied: toolz in /opt/conda/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio==3.50.2) (0.12.0)\n", - "Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)\n", - " Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)\n", - "Collecting cachetools<6.0,>=2.0.0 (from google-auth>=1.2->gcsfs)\n", - " Downloading cachetools-5.3.2-py3-none-any.whl.metadata (5.2 kB)\n", - "Collecting pyasn1-modules>=0.2.1 (from google-auth>=1.2->gcsfs)\n", - " Downloading pyasn1_modules-0.3.0-py2.py3-none-any.whl (181 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m181.3/181.3 kB\u001b[0m \u001b[31m59.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting rsa<5,>=3.1.4 (from google-auth>=1.2->gcsfs)\n", - " Downloading rsa-4.9-py3-none-any.whl (34 kB)\n", - "Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib->gcsfs)\n", - " Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)\n", - "Collecting contourpy>=1.0.1 (from matplotlib->bert-score==0.3.13)\n", - " Downloading contourpy-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)\n", - "Collecting cycler>=0.10 (from matplotlib->bert-score==0.3.13)\n", - " Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n", - "Collecting fonttools>=4.22.0 (from matplotlib->bert-score==0.3.13)\n", - " Downloading fonttools-4.47.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (157 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m157.2/157.2 kB\u001b[0m \u001b[31m41.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting kiwisolver>=1.3.1 (from matplotlib->bert-score==0.3.13)\n", - " Downloading kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (6.4 kB)\n", - "Collecting pyparsing>=2.3.1 (from matplotlib->bert-score==0.3.13)\n", - " Downloading pyparsing-3.1.1-py3-none-any.whl.metadata (5.1 kB)\n", - "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib->bert-score==0.3.13) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.1->bert-score==0.3.13) (2023.3.post1)\n", - "Collecting tzdata>=2022.1 (from pandas>=1.0.1->bert-score==0.3.13)\n", - " Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m72.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: wcwidth in /opt/conda/lib/python3.10/site-packages (from prompt-toolkit>=3.0.0->fschat==0.2.34) (0.2.5)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (2.0.4)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (3.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (1.26.18)\n", - "Requirement already satisfied: certifi>=2017.4.17 in 
/opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (2023.7.22)\n", - "Collecting markdown-it-py>=2.2.0 (from rich>=10.0.0->fschat==0.2.34)\n", - " Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from rich>=10.0.0->fschat==0.2.34) (2.15.1)\n", - "Collecting h11>=0.8 (from uvicorn->fschat==0.2.34)\n", - " Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m21.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting humanfriendly>=9.1 (from coloredlogs->optimum==1.13.2)\n", - " Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting starlette<0.33.0,>=0.29.0 (from fastapi->fschat==0.2.34)\n", - " Downloading starlette-0.32.0.post1-py3-none-any.whl.metadata (5.8 kB)\n", - "Collecting typing-extensions~=4.0 (from gradio==3.50.2)\n", - " Downloading typing_extensions-4.9.0-py3-none-any.whl.metadata (3.0 kB)\n", - "Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 (from google-cloud-storage->gcsfs)\n", - " Downloading google_api_core-2.15.0-py3-none-any.whl.metadata (2.7 kB)\n", - "Collecting google-cloud-core<3.0dev,>=2.3.0 (from google-cloud-storage->gcsfs)\n", - " Downloading google_cloud_core-2.4.1-py2.py3-none-any.whl.metadata (2.7 kB)\n", - "Collecting google-resumable-media>=2.6.0 (from google-cloud-storage->gcsfs)\n", - " Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl.metadata (2.2 kB)\n", - "Collecting google-crc32c<2.0dev,>=1.0 (from google-cloud-storage->gcsfs)\n", - " Downloading google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32 kB)\n", - "Requirement already satisfied: anyio in /opt/conda/lib/python3.10/site-packages (from httpx->fschat==0.2.34) (4.2.0)\n", - "Collecting httpcore==1.* (from httpx->fschat==0.2.34)\n", - " Downloading httpcore-1.0.2-py3-none-any.whl.metadata (20 kB)\n", - "Requirement already satisfied: sniffio in /opt/conda/lib/python3.10/site-packages (from httpx->fschat==0.2.34) (1.3.0)\n", - "Collecting wavedrom (from markdown2[all]->fschat==0.2.34)\n", - " Downloading wavedrom-2.0.3.post3.tar.gz (137 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.7/137.7 kB\u001b[0m \u001b[31m47.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25ldone\n", - "\u001b[?25hRequirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->optimum==1.13.2) (1.3.0)\n", - "Collecting jmespath<2.0.0,>=0.7.1 (from botocore<1.31.65,>=1.31.16->aiobotocore~=2.7.0->s3fs)\n", - " Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n", - "Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb)\n", - " Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)\n", - "Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage->gcsfs)\n", - " Downloading googleapis_common_protos-1.62.0-py2.py3-none-any.whl.metadata (1.5 kB)\n", - "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (2023.12.1)\n", - "Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (0.32.0)\n", - "Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (0.15.2)\n", - "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=10.0.0->fschat==0.2.34)\n", - " Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n", - "Collecting pyasn1<0.6.0,>=0.4.6 (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs)\n", - " Downloading pyasn1-0.5.1-py2.py3-none-any.whl.metadata (8.6 kB)\n", - "Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib->gcsfs)\n", - " Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m151.7/151.7 kB\u001b[0m \u001b[31m50.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: exceptiongroup>=1.0.2 in /opt/conda/lib/python3.10/site-packages (from anyio->httpx->fschat==0.2.34) (1.0.4)\n", - "Collecting svgwrite (from wavedrom->markdown2[all]->fschat==0.2.34)\n", - " Downloading svgwrite-1.4.3-py3-none-any.whl (67 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.1/67.1 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m53.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading auto_gptq-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.8/4.8 MB\u001b[0m \u001b[31m89.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n", - "\u001b[?25hDownloading fschat-0.2.34-py3-none-any.whl (220 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m220.1/220.1 kB\u001b[0m \u001b[31m63.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading gradio-3.50.2-py3-none-any.whl (20.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.3/20.3 MB\u001b[0m \u001b[31m82.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hDownloading peft-0.6.0-py3-none-any.whl (134 kB)\n", - "\u001b[2K 
- "  ... (pip download, wheel-build and install progress output elided: the cell installs the axolotl extras, including transformers 4.36.2, tokenizers 0.15.0, xformers 0.0.23, bitsandbytes 0.41.3.post2, datasets 2.16.0, deepspeed 0.12.6, flash-attn 2.3.3 and peft 0.6.0, then reinstalls peft from git, which pip flags as incompatible with axolotl 0.3.0's peft==0.6.0 pin) ...\n",
- "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], - "source": [ - "#instaling what is needed inside axolotl file\n", - "!pip install packaging\n", - "!pip install -e '.[flash-attn,deepspeed]'\n", - "!pip install -U git+https://github.com/huggingface/peft.git" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "82d1a380-1e87-48fe-89fe-25331326014d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The following values were not passed to `accelerate launch` and had defaults used instead:\n", - "\t`--num_processes` was set to a value of `3`\n", - "\t\tMore than one GPU was found, enabling multi-GPU training.\n", - "\t\tIf this was unintended please pass in `--num_processes=1`.\n", - "\t`--num_machines` was set to a value of `1`\n", - "\t`--mixed_precision` was set to a value of `'no'`\n", - "\t`--dynamo_backend` was set to a value of `'no'`\n", - "To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.\n", - "/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", - " warnings.warn(\n", - "[2023-12-28 15:44:09,979] [INFO] [datasets.:58] [PID:2814] PyTorch version 2.1.1 available.\n", - "/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", - " warnings.warn(\n", - "/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", - " warnings.warn(\n", - "[2023-12-28 15:44:10,011] [INFO] [datasets.:58] [PID:2812] PyTorch version 2.1.1 available.\n", - "[2023-12-28 15:44:10,013] [INFO] [datasets.:58] [PID:2813] PyTorch version 2.1.1 available.\n", - "[2023-12-28 15:44:10,805] [INFO] [axolotl.normalize_config:150] [PID:2814] [RANK:2] GPU memory usage baseline: 0.000GB (+0.317GB misc)\u001b[39m\n", - "[2023-12-28 15:44:10,830] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2023-12-28 15:44:10,842] [INFO] [axolotl.normalize_config:150] [PID:2813] [RANK:1] GPU memory usage baseline: 0.000GB (+0.317GB misc)\u001b[39m\n", - "[2023-12-28 15:44:10,865] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2023-12-28 15:44:10,869] [INFO] [axolotl.normalize_config:150] [PID:2812] [RANK:0] GPU memory usage baseline: 0.000GB (+0.351GB misc)\u001b[39m\n", - "[2023-12-28 15:44:10,887] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n", - "[2023-12-28 15:44:10,961] [INFO] [comm.py:637:init_distributed] cdb=None\n", - "[2023-12-28 15:44:10,994] [INFO] [comm.py:637:init_distributed] cdb=None\n", - "[2023-12-28 15:44:11,015] [INFO] [comm.py:637:init_distributed] cdb=None\n", - "[2023-12-28 15:44:11,015] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n", - " dP dP dP \n", - " 88 88 88 \n", - " .d8888b. dP. .dP .d8888b. 88 .d8888b. 
d8888P 88 \n", - " 88' `88 `8bd8' 88' `88 88 88' `88 88 88 \n", - " 88. .88 .d88b. 88. .88 88 88. .88 88 88 \n", - " `88888P8 dP' `dP `88888P' dP `88888P' dP dP \n", - " \n", - " \n", - "\n", - "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:184] [PID:2812] [RANK:0] EOS: 2 / \u001b[39m\n", - "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:185] [PID:2812] [RANK:0] BOS: 1 / \u001b[39m\n", - "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:186] [PID:2812] [RANK:0] PAD: 2 / \u001b[39m\n", - "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:187] [PID:2812] [RANK:0] UNK: 0 / \u001b[39m\n", - "[2023-12-28 15:44:11,413] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2812] [RANK:0] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n", - "[2023-12-28 15:44:11,415] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2812] [RANK:0] Prepared dataset loaded from disk...\u001b[39m\n", - "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:184] [PID:2814] [RANK:2] EOS: 2 / \u001b[39m\n", - "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:185] [PID:2814] [RANK:2] BOS: 1 / \u001b[39m\n", - "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:186] [PID:2814] [RANK:2] PAD: 2 / \u001b[39m\n", - "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:187] [PID:2814] [RANK:2] UNK: 0 / \u001b[39m\n", - "[2023-12-28 15:44:11,530] [DEBUG] [axolotl.load_tokenizer:184] [PID:2813] [RANK:1] EOS: 2 / \u001b[39m\n", - "[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:185] [PID:2813] [RANK:1] BOS: 1 / \u001b[39m\n", - "[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:186] [PID:2813] [RANK:1] PAD: 2 / \u001b[39m\n", - "[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:187] [PID:2813] [RANK:1] UNK: 0 / \u001b[39m\n", - "[2023-12-28 15:44:12,158] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2813] [RANK:1] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n", - "[2023-12-28 15:44:12,158] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2814] [RANK:2] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n", - "[2023-12-28 15:44:12,160] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2813] [RANK:1] Prepared dataset loaded from disk...\u001b[39m\n", - "[2023-12-28 15:44:12,161] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2814] [RANK:2] Prepared dataset loaded from disk...\u001b[39m\n", - "[2023-12-28 15:44:12,236] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] total_num_tokens: 28120\u001b[39m\n", - "[2023-12-28 15:44:12,238] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] `total_supervised_tokens: 7990`\u001b[39m\n", - "[2023-12-28 15:44:12,238] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] total_num_steps: 6\u001b[39m\n", - "[2023-12-28 15:44:12,242] [DEBUG] [axolotl.train.log:60] [PID:2812] [RANK:0] loading tokenizer... 
mistralai/Mistral-7B-v0.1\u001b[39m\n", - "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:184] [PID:2812] [RANK:0] EOS: 2 / \u001b[39m\n", - "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:185] [PID:2812] [RANK:0] BOS: 1 / \u001b[39m\n", - "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:186] [PID:2812] [RANK:0] PAD: 2 / \u001b[39m\n", - "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:187] [PID:2812] [RANK:0] UNK: 0 / \u001b[39m\n", - "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.train.log:60] [PID:2812] [RANK:0] loading model and peft_config...\u001b[39m\n", - "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:184] [PID:2814] [RANK:2] EOS: 2 / \u001b[39m\n", - "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:185] [PID:2814] [RANK:2] BOS: 1 / \u001b[39m\n", - "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:186] [PID:2814] [RANK:2] PAD: 2 / \u001b[39m\n", - "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:187] [PID:2814] [RANK:2] UNK: 0 / \u001b[39m\n", - "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:184] [PID:2813] [RANK:1] EOS: 2 / \u001b[39m\n", - "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:185] [PID:2813] [RANK:1] BOS: 1 / \u001b[39m\n", - "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:186] [PID:2813] [RANK:1] PAD: 2 / \u001b[39m\n", - "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:187] [PID:2813] [RANK:1] UNK: 0 / \u001b[39m\n", - "[2023-12-28 15:44:13,049] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 291, num_elems = 7.24B\n", - "Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:11<00:00, 5.81s/it]\n", - "Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:11<00:00, 5.98s/it]\n", - "[2023-12-28 15:44:25,395] [INFO] [axolotl.load_model:503] [PID:2813] [RANK:1] GPU memory usage after model load: 7.576GB (+0.524GB cache, +0.708GB misc)\u001b[39m\n", - "[2023-12-28 15:44:25,399] [INFO] [axolotl.load_model:526] [PID:2813] [RANK:1] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n", - "[2023-12-28 15:44:25,403] [INFO] [axolotl.load_model:538] [PID:2813] [RANK:1] converting modules to torch.bfloat16 for flash attention\u001b[39m\n", - "trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n", - "[2023-12-28 15:44:25,480] [INFO] [axolotl.load_model:568] [PID:2813] [RANK:1] GPU memory usage after adapters: 7.589GB (+1.501GB cache, +0.708GB misc)\u001b[39m\n", - "[2023-12-28 15:44:25,572] [INFO] [axolotl.load_model:503] [PID:2814] [RANK:2] GPU memory usage after model load: 7.576GB (+0.410GB cache, +0.708GB misc)\u001b[39m\n", - "[2023-12-28 15:44:25,576] [INFO] [axolotl.load_model:526] [PID:2814] [RANK:2] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n", - "[2023-12-28 15:44:25,580] [INFO] [axolotl.load_model:538] [PID:2814] [RANK:2] converting modules to torch.bfloat16 for flash attention\u001b[39m\n", - "trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n", - "[2023-12-28 15:44:25,660] [INFO] [axolotl.load_model:568] [PID:2814] [RANK:2] GPU memory usage after adapters: 7.589GB (+1.388GB cache, +0.708GB misc)\u001b[39m\n", - "Loading checkpoint shards: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:12<00:00, 6.30s/it]\n", - "[2023-12-28 15:44:26,170] [INFO] [axolotl.load_model:503] [PID:2812] 
[RANK:0] GPU memory usage after model load: 7.576GB (+0.776GB cache, +0.741GB misc)\u001b[39m\n", - "[2023-12-28 15:44:26,177] [INFO] [axolotl.load_model:526] [PID:2812] [RANK:0] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n", - "[2023-12-28 15:44:26,181] [INFO] [axolotl.load_model:538] [PID:2812] [RANK:0] converting modules to torch.bfloat16 for flash attention\u001b[39m\n", - "trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n", - "[2023-12-28 15:44:26,259] [INFO] [axolotl.load_model:568] [PID:2812] [RANK:0] GPU memory usage after adapters: 7.589GB (+1.753GB cache, +0.741GB misc)\u001b[39m\n", - "[2023-12-28 15:44:26,293] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Pre-saving adapter config to ./out\u001b[39m\n", - "[2023-12-28 15:44:26,296] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Starting trainer...\u001b[39m\n", - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n", - "Detected CUDA files, patching ldflags\n", - "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n", - "Building extension module fused_adam...\n", - "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n", - "ninja: no work to do.\n", - "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.05891108512878418 seconds\n", - "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.10173463821411133 seconds\n", - "Loading extension module fused_adam...\n", - "Time to load fused_adam op: 0.10152459144592285 seconds\n", - "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", - " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", - "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", - " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", - "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. 
(Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n", - " self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n", - "Parameter Offload: Total persistent parameters: 3674112 in 193 params\n", - " 0%| | 0/17 [00:00: Who is the Founder of Apple\""}, {"from": "gpt", "value": "\": The founder of Apple is Steve Jobs\""}]} -{"conversations": [{"from": "Customer", "value": "\": What is the capital of France?\""}, {"from": "gpt", "value": "\": The capital of France is Paris.\""}]} -{"conversations": [{"from": "Customer", "value": "\": How far is the Moon from Earth?\""}, {"from": "gpt", "value": "\": The Moon is approximately 384,400 kilometers from Earth.\""}]} -{"conversations": [{"from": "Customer", "value": "\": What is the tallest mountain in the world?\""}, {"from": "gpt", "value": "\": The tallest mountain in the world is Mount Everest.\""}]} -{"conversations": [{"from": "Customer", "value": "\": Who wrote Romeo and Juliet?\""}, {"from": "gpt", "value": "\": Romeo and Juliet was written by William Shakespeare.\""}]} -{"conversations": [{"from": "Customer", "value": "\": What is the boiling point of water?\""}, {"from": "gpt", "value": "\": The boiling point of water is 100 degrees Celsius.\""}]} -{"conversations": [{"from": "Customer", "value": "\": When was the first man on the moon?\""}, {"from": "gpt", "value": "\": The first man landed on the moon in 1969.\""}]} -{"conversations": [{"from": "Customer", "value": "\": What is the largest ocean?\""}, {"from": "gpt", "value": "\": The largest ocean is the Pacific Ocean.\""}]} -{"conversations": [{"from": "Customer", "value": "\": Who invented the telephone?\""}, {"from": "gpt", "value": "\": The telephone was invented by Alexander Graham Bell.\""}]} -{"conversations": [{"from": "Customer", "value": "\": What is the formula for water?\""}, {"from": "gpt", "value": "\": The chemical formula for water is H2O.\""}]} diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml index e4c73fac95..c909c63e22 100644 --- a/examples/mistral/config.yml +++ b/examples/mistral/config.yml @@ -56,6 +56,3 @@ weight_decay: 0.0 fsdp: fsdp_config: special_tokens: - bos_token: "" - eos_token: "" - unk_token: "" diff --git a/examples/mistral/Mistral-7b-example/config.yml b/examples/mistral/lora.yml similarity index 51% rename from examples/mistral/Mistral-7b-example/config.yml rename to examples/mistral/lora.yml index fd12494627..ac9ac0dd98 100644 --- a/examples/mistral/Mistral-7b-example/config.yml +++ b/examples/mistral/lora.yml @@ -1,4 +1,3 @@ -#Mistral-7b base_model: mistralai/Mistral-7B-v0.1 model_type: MistralForCausalLM tokenizer_type: LlamaTokenizer @@ -8,26 +7,32 @@ load_in_4bit: false strict: false datasets: - - path: tilemachos/Demo-Dataset #Path to json dataset file in huggingface - #for type,conversation arguments read axolotl readme and pick what is suited for your project, I wanted a chatbot and put sharegpt and chatml - type: sharegpt - conversation: chatml -dataset_prepared_path: tilemachos/Demo-Dataset #Path to json dataset file in huggingface -val_set_size: 0.05 -output_dir: ./out + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: ./lora-out -#using lora for lower cost adapter: lora -lora_r: 8 +lora_model_dir: + +sequence_len: 8192 +sample_packing: true +pad_to_sequence_len: true + +lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: 
lora_target_modules: + - gate_proj + - down_proj + - up_proj - q_proj - v_proj - -sequence_len: 512 -sample_packing: false -pad_to_sequence_len: true + - k_proj + - o_proj wandb_project: wandb_entity: @@ -35,18 +40,17 @@ wandb_watch: wandb_name: wandb_log_model: -#only 2 epochs because of small dataset -gradient_accumulation_steps: 3 +gradient_accumulation_steps: 4 micro_batch_size: 2 -num_epochs: 2 +num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 train_on_inputs: false group_by_length: false -bf16: true -fp16: false +bf16: auto +fp16: tf32: false gradient_checkpointing: true @@ -57,18 +61,17 @@ logging_steps: 1 xformers_attention: flash_attention: true +loss_watchdog_threshold: 5.0 +loss_watchdog_patience: 3 + warmup_steps: 10 evals_per_epoch: 4 eval_table_size: eval_max_new_tokens: 128 saves_per_epoch: 1 debug: -#default deepspeed, can use more aggresive if needed like zero2, zero3 -deepspeed: deepspeed_configs/zero1.json +deepspeed: weight_decay: 0.0 fsdp: fsdp_config: special_tokens: - bos_token: "" - eos_token: "" - unk_token: "" diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml index c8ab13b979..6fbbb96183 100644 --- a/examples/mistral/qlora.yml +++ b/examples/mistral/qlora.yml @@ -75,6 +75,3 @@ weight_decay: 0.0 fsdp: fsdp_config: special_tokens: - bos_token: "" - eos_token: "" - unk_token: "" diff --git a/favicon.jpg b/favicon.jpg new file mode 100644 index 0000000000..43c6902443 Binary files /dev/null and b/favicon.jpg differ diff --git a/index.qmd b/index.qmd new file mode 100644 index 0000000000..87d6858808 --- /dev/null +++ b/index.qmd @@ -0,0 +1,19 @@ + + +```{python} +#|output: asis +#|echo: false + +# This cell steals the README as the home page for now, but excludes the table of contents (quarto adds its own) +import re +pattern = re.compile( + r"\s*\s*\s*\s*
\s*## Table of Contents.*?
", + re.DOTALL | re.IGNORECASE +) + +with open('README.md', 'r') as f: + txt = f.read() + +cleaned = pattern.sub("", txt) +print(cleaned) +``` diff --git a/requirements.txt b/requirements.txt index 191948a400..75ce7a0d8a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ packaging==23.2 peft==0.9.0 -transformers==4.38.2 +transformers @ git+https://github.com/huggingface/transformers.git@73a73b415e36f41481369f6129cb4b62bb127a78 tokenizers==0.15.0 -bitsandbytes>=0.43.0 -accelerate==0.26.1 +bitsandbytes==0.43.0 +accelerate==0.28.0 deepspeed==0.13.1 pydantic==2.6.3 addict @@ -39,5 +39,4 @@ s3fs gcsfs # adlfs -trl>=0.7.9 -fastcore>=1.5.29 +trl @ git+https://github.com/huggingface/trl.git@304e208f778a5442c30cdda500348226cdc97d90 diff --git a/setup.py b/setup.py index 40dd0a6686..307691bd49 100644 --- a/setup.py +++ b/setup.py @@ -89,5 +89,8 @@ def parse_requirements(): "lion-pytorch": [ "lion-pytorch==0.1.2", ], + "galore": [ + "galore_torch", + ], }, ) diff --git a/src/axolotl/core/policies/__init__.py b/src/axolotl/core/policies/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/axolotl/core/policies/auto_wrap.py b/src/axolotl/core/policies/auto_wrap.py deleted file mode 100644 index d42b62ee08..0000000000 --- a/src/axolotl/core/policies/auto_wrap.py +++ /dev/null @@ -1,55 +0,0 @@ -"""module for building the auto wrap policy for FSDP""" -import functools - -from peft import PrefixEncoder, PromptEmbedding, PromptEncoder -from torch.distributed.fsdp.wrap import ( - _or_policy, - lambda_auto_wrap_policy, - transformer_auto_wrap_policy, -) -from transformers.models.llama.modeling_llama import LlamaDecoderLayer -from transformers.models.mistral.modeling_mistral import MistralDecoderLayer -from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer - -SUPPORTED_AUTO_WRAP_MODEL_TYPES = [ - "llama", - "mistral", - "mixtral", -] - - -def get_wrapping_policy_factory(model_type): - if model_type == "llama": - layer_to_wrap = LlamaDecoderLayer - elif model_type == "mistral": - layer_to_wrap = MistralDecoderLayer - elif model_type == "mixtral": - layer_to_wrap = MixtralDecoderLayer - - def get_wrapping_policy(): - """This checks for lora layers (has weight and requires_grad)""" - - def lambda_policy_fn(module): - return ( - len(list(module.named_children())) == 0 - and getattr(module, "weight", None) is not None - and module.weight.requires_grad - ) - - lambda_policy = functools.partial( - lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn - ) - transformer_layer_name = layer_to_wrap - transformer_wrap_policy = functools.partial( - transformer_auto_wrap_policy, - transformer_layer_cls=( - PrefixEncoder, - PromptEncoder, - PromptEmbedding, - transformer_layer_name, - ), - ) - policies = [lambda_policy, transformer_wrap_policy] - return functools.partial(_or_policy, policies=policies) - - return get_wrapping_policy diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py index 374a28df74..4d85b40dee 100644 --- a/src/axolotl/core/trainer_builder.py +++ b/src/axolotl/core/trainer_builder.py @@ -8,7 +8,6 @@ import importlib.util import logging import math -import os import sys from abc import abstractmethod from collections import defaultdict @@ -19,10 +18,7 @@ import torch import transformers -from accelerate import FullyShardedDataParallelPlugin -from accelerate.utils import str_to_bool from datasets import Dataset -from 
torch.distributed.fsdp import MixedPrecision from torch.optim.lr_scheduler import OneCycleLR from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler from transformers import ( @@ -34,8 +30,8 @@ from transformers.trainer_utils import seed_worker from transformers.utils import is_sagemaker_mp_enabled from trl import DPOTrainer +from trl.trainer.utils import pad_to_length -from axolotl.core.policies.auto_wrap import get_wrapping_policy_factory from axolotl.loraplus import create_loraplus_optimizer from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler @@ -220,7 +216,7 @@ def __init__( num_epochs=1, bench_data_collator=None, eval_data_collator=None, - **kwargs + **kwargs, ): self.num_epochs = num_epochs self.bench_data_collator = bench_data_collator @@ -239,6 +235,7 @@ def create_optimizer(self): if self.optimizer is None: # pylint: disable=access-member-before-definition optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs( self.args, + opt_model, ) loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None) @@ -476,6 +473,58 @@ def compute_loss(self, model, inputs, return_outputs=False): return self.orpo_compute_loss(model, inputs, return_outputs=return_outputs) return super().compute_loss(model, inputs, return_outputs=return_outputs) + @staticmethod + def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None): + concatenated_batch = {} + + max_length = max( + inputs["input_ids"].shape[1], inputs["rejected_input_ids"].shape[1] + ) + # Concatenate positive and negative inputs + concatenated_batch["input_ids"] = pad_to_length( + inputs["input_ids"], max_length, pad_token + ) + concatenated_batch["rejected_input_ids"] = pad_to_length( + inputs["rejected_input_ids"], max_length, pad_token + ) + concatenated_batch["labels"] = pad_to_length( + inputs["labels"], max_length, label_pad_token + ) + concatenated_batch["rejected_labels"] = pad_to_length( + inputs["rejected_labels"], max_length, label_pad_token + ) + concatenated_batch["attention_mask"] = pad_to_length( + inputs["attention_mask"], max_length, 0 + ) + concatenated_batch["rejected_attention_mask"] = pad_to_length( + inputs["rejected_attention_mask"], max_length, 0 + ) + concatenated_batch["prompt_attention_mask"] = pad_to_length( + inputs["prompt_attention_mask"], max_length, 0 + ).to(device=device) + + input_ids = torch.cat( + [concatenated_batch["input_ids"], concatenated_batch["rejected_input_ids"]], + dim=0, + ).to(device=device) + attention_mask = torch.cat( + [ + concatenated_batch["attention_mask"], + concatenated_batch["rejected_attention_mask"], + ], + dim=0, + ).to(device=device) + labels = torch.cat( + [concatenated_batch["labels"], concatenated_batch["rejected_labels"]], dim=0 + ).to(device=device) + + return { + "input_ids": input_ids, + "labels": labels, + "attention_mask": attention_mask, + "prompt_attention_mask": concatenated_batch["prompt_attention_mask"], + } + def orpo_compute_custom_loss(self, logits, labels): logits = logits.contiguous() loss = 0.0 @@ -516,45 +565,46 @@ def orpo_compute_logps( dim=2, index=(mask * chosen_inputs[:, 1:]).unsqueeze(2), ).squeeze(2) - return torch.mul(per_token_logps, mask.to(dtype=torch.bfloat16)).sum(dim=1).to( - dtype=torch.float64 - ) / mask.sum(dim=1).to(dtype=torch.float64) + return torch.mul(per_token_logps, mask).sum(dim=1) / mask.sum(dim=1) def orpo_compute_loss(self, model, inputs, return_outputs=False): - 
outputs_neg = model( - **{ - "input_ids": inputs["rejected_input_ids"], - "attention_mask": inputs["rejected_attention_mask"], - "labels": inputs["rejected_labels"], - }, - output_hidden_states=True, + concat_inputs = AxolotlTrainer.orpo_concatenate_inputs( + inputs, + label_pad_token=-100, + pad_token=self.tokenizer.pad_token_id, + device=self.accelerator.device, ) - outputs_pos = model( + + # Perform a single forward pass + outputs = model( **{ - "input_ids": inputs["input_ids"], - "attention_mask": inputs["attention_mask"], - "labels": inputs["labels"], + "input_ids": concat_inputs["input_ids"], + "attention_mask": concat_inputs["attention_mask"], + "labels": concat_inputs["labels"], }, output_hidden_states=True, ) + # Split the outputs for positive and negative examples + outputs_pos, outputs_neg = outputs.logits.chunk(2) + # Calculate NLL loss pos_loss = self.orpo_compute_custom_loss( - logits=outputs_pos.logits, labels=inputs["input_ids"] + logits=outputs_pos, labels=concat_inputs["input_ids"].chunk(2)[0] ) # Calculate Log Probability pos_prob = self.orpo_compute_logps( - prompt_attention_mask=inputs["prompt_attention_mask"], - chosen_inputs=inputs["input_ids"], - chosen_attention_mask=inputs["attention_mask"], - logits=outputs_pos.logits, + prompt_attention_mask=concat_inputs["prompt_attention_mask"], + chosen_inputs=concat_inputs["input_ids"].chunk(2)[0], + chosen_attention_mask=concat_inputs["attention_mask"].chunk(2)[0], + logits=outputs_pos, ) neg_prob = self.orpo_compute_logps( - prompt_attention_mask=inputs["prompt_attention_mask"], - chosen_inputs=inputs["rejected_input_ids"], - chosen_attention_mask=inputs["rejected_attention_mask"], - logits=outputs_neg.logits, + prompt_attention_mask=concat_inputs["prompt_attention_mask"], + chosen_inputs=concat_inputs["input_ids"].chunk(2)[1], + chosen_attention_mask=concat_inputs["attention_mask"].chunk(2)[1], + logits=outputs_neg, ) # Calculate log odds @@ -590,51 +640,14 @@ def push_to_hub(self, *args, **kwargs) -> str: @wraps(Trainer.create_accelerator_and_postprocess) def create_accelerator_and_postprocess(self): - rank = int(os.environ.get("LOCAL_RANK", 0)) res = super().create_accelerator_and_postprocess() - if self.args.qlora is False: - return res - - # the rest of this method override is specific to fsdp + qlora (for now) - sync_module_states = ( - str_to_bool(os.environ.get("FSDP_SYNC_MODULE_STATES", "True")) == 1 - ) - - mp_policy = None - amp = os.environ["ACCELERATE_MIXED_PRECISION"] - if amp == "fp16": - mp_policy = MixedPrecision( - param_dtype=torch.float32, - reduce_dtype=torch.float32, - buffer_dtype=torch.float32, - ) - elif amp == "bf16": - mp_policy = MixedPrecision( - param_dtype=torch.float32, - reduce_dtype=torch.float32, - buffer_dtype=torch.float32, - ) - - # If somehow we figure out how we want to parameterize we want to autocast buffers... 
- # mp_policy = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16, buffer_dtype=torch.float32) - # load_param_skip_names = ['inv_freq'] - if self.is_fsdp_enabled: - wrapping_policy = get_wrapping_policy_factory(self.args.model_type) - fsdp_plugin = FullyShardedDataParallelPlugin( - auto_wrap_policy=wrapping_policy(), - cpu_offload=False, - use_orig_params=False, - limit_all_gathers=True, - param_init_fn=lambda module: module.to_empty( - device=torch.device("cuda"), recurse=False - ) - if (rank != 0 and sync_module_states) - else None, - mixed_precision_policy=mp_policy, - ) - self.accelerator.state.fsdp_plugin = fsdp_plugin + if ( + "limit_all_gathers" in self.args.fsdp_config + and self.args.fsdp_config["limit_all_gathers"] + ): + self.accelerator.state.fsdp_plugin.limit_all_gathers = True return res @@ -792,6 +805,12 @@ def __init__(self, cfg, model, tokenizer): self.model = model self.tokenizer = tokenizer + # in case the model supports tagging, add the axolotl tag. + # This makes sure the tag is correctly pushed even if a user calls + # model.push_to_hub instad of trainer.push_to_hub. + if hasattr(model, "add_model_tags"): + model.add_model_tags(["axolotl"]) + @property def model_ref(self): return self._model_ref @@ -1150,6 +1169,18 @@ def build(self, total_num_steps): training_arguments_kwargs["optim"] = ( self.cfg.optimizer if self.cfg.optimizer else "adamw_hf" ) + if self.cfg.optim_args: + if isinstance(self.cfg.optim_args, dict): + optim_args = ",".join( + [f"{key}={value}" for key, value in self.cfg.optim_args.items()] + ) + else: + optim_args = self.cfg.optim_args + training_arguments_kwargs["optim_args"] = optim_args + if self.cfg.optim_target_modules: + training_arguments_kwargs[ + "optim_target_modules" + ] = self.cfg.optim_target_modules training_arguments_kwargs["loraplus_lr_ratio"] = self.cfg.loraplus_lr_ratio training_arguments_kwargs[ "loraplus_lr_embedding" @@ -1270,6 +1301,7 @@ def build(self, total_num_steps): train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, args=training_args, + tokenizer=self.tokenizer, data_collator=self.build_collator(training_args, **data_collator_kwargs), eval_data_collator=self.build_collator( training_args, is_eval=True, **data_collator_kwargs diff --git a/src/axolotl/prompt_strategies/dpo/__init__.py b/src/axolotl/prompt_strategies/dpo/__init__.py index 1a149f4528..7f5e6eb644 100644 --- a/src/axolotl/prompt_strategies/dpo/__init__.py +++ b/src/axolotl/prompt_strategies/dpo/__init__.py @@ -5,4 +5,4 @@ from ..base import load as load_base -load = partial(load_base, module="axolotl.prompt_strategies.dpo") +load = partial(load_base, module_base="axolotl.prompt_strategies.dpo") diff --git a/src/axolotl/prompt_strategies/orpo/chat_template.py b/src/axolotl/prompt_strategies/orpo/chat_template.py index fb39bcf8f4..9953fe87e8 100644 --- a/src/axolotl/prompt_strategies/orpo/chat_template.py +++ b/src/axolotl/prompt_strategies/orpo/chat_template.py @@ -36,6 +36,7 @@ def load( chat_template = chat_templates(chat_template) except ValueError: pass + tokenizer.chat_template = chat_template return ORPOTokenizingStrategy( ORPOPrompter(chat_template, tokenizer), diff --git a/src/axolotl/prompt_strategies/sharegpt.py b/src/axolotl/prompt_strategies/sharegpt.py index 7a7f61a8ee..55bdd37b4f 100644 --- a/src/axolotl/prompt_strategies/sharegpt.py +++ b/src/axolotl/prompt_strategies/sharegpt.py @@ -1,5 +1,6 @@ """Module containing the SimpleShareGPTPromptTokenizingStrategy class""" +import logging from typing import Any, Dict, 
Optional from fastchat.conversation import Conversation, SeparatorStyle, register_conv_template @@ -11,6 +12,8 @@ merge_consecutive_messages, ) +LOG = logging.getLogger("axolotl") + def register_chatml_template(system_message=None): system_message = system_message or "You are a helpful assistant." @@ -42,11 +45,13 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None): ) field_human = ds_cfg["field_human"] if ds_cfg and "field_human" in ds_cfg else None field_model = ds_cfg["field_model"] if ds_cfg and "field_model" in ds_cfg else None + roles = ds_cfg["roles"].to_dict() if ds_cfg and "roles" in ds_cfg else None strategy = SimpleShareGPTPromptTokenizingStrategy( ShareGPTPrompterV2( conversation=conversation, role_key_model=field_model, role_key_human=field_human, + roles=roles, ), tokenizer, cfg.train_on_inputs, @@ -142,7 +147,12 @@ def get_conversation_thread(self, prompt): "system": "system", } turns = [ - {"from": role_map[t[role_key]], "value": t[value_key]} + { + "from": ( + role_map[t[role_key]] if t[role_key] in role_map else t[role_key] + ), + "value": t[value_key], + } for t in conversations ] return turns diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index 7e62a0cd4c..bb13cf76dd 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -11,7 +11,7 @@ from axolotl.monkeypatch.fastchat_conversation_turns import ( add_get_turns_to_conversation, ) -from axolotl.prompters import IGNORE_TOKEN_ID +from axolotl.prompters import IGNORE_TOKEN_ID, Prompter LOG = logging.getLogger("axolotl") @@ -37,7 +37,7 @@ class PromptTokenizingStrategy(abc.ABC): def __init__( self, - prompter, + prompter: Prompter, tokenizer, train_on_inputs: bool = False, sequence_len: int = 2048, @@ -340,6 +340,23 @@ def tokenize_prompt(self, prompt): self.prompter._conversation.copy() # pylint: disable=protected-access ) + input_roles = {conversation.roles[0]} + output_roles = {conversation.roles[1]} + + if len(conversation.roles) == 3: + tool_role_label = conversation.roles[2] + input_roles.add(tool_role_label) + + # Add roles from the config + if self.prompter.roles: + if "input" in self.prompter.roles and self.prompter.roles["input"]: + for role in self.prompter.roles["input"]: + input_roles.add(role) + + if "output" in self.prompter.roles and self.prompter.roles["output"]: + for role in self.prompter.roles["output"]: + output_roles.add(role) + # support for custom roles from the dataset, only useful for vicuna style prompts/roles role_remap = [] if ( @@ -360,19 +377,18 @@ def tokenize_prompt(self, prompt): LOG.warning(f"expected tuple, got {part}") continue - tool_role_label = None - if len(conversation.roles) == 3: - ( - user_role_label, - assistant_role_label, - tool_role_label, - ) = conversation.roles - else: - user_role_label, assistant_role_label = conversation.roles role, content = part # Uses "in" because role contains extra characters - if user_role_label in role: + input_turn = any(r.lower() in role.lower() for r in input_roles) + output_turn = any(r.lower() in role.lower() for r in output_roles) + empty_role = role.strip() == "" + + if not any([input_turn, output_turn, empty_role]): + LOG.warning(f"unhandled role: {role}") + continue + + if input_turn: role = ( role.replace(role_remap[0]["from"], role_remap[0]["to"]) if role_remap @@ -392,7 +408,7 @@ def tokenize_prompt(self, prompt): else: # everything from this is masked out from the labels labels = [IGNORE_TOKEN_ID] * len(res["input_ids"]) - elif assistant_role_label in 
role: + elif output_turn: role = ( role.replace(role_remap[1]["from"], role_remap[1]["to"]) if role_remap @@ -423,7 +439,7 @@ def tokenize_prompt(self, prompt): labels[:len_role] = [IGNORE_TOKEN_ID] * min( len_role, len(labels) ) - elif role == "": + elif empty_role: turn = content # this is only ever the first part, should include the bos token and the user query res = self._tokenize( @@ -434,11 +450,6 @@ def tokenize_prompt(self, prompt): else: # everything from this is masked out from the labels labels = [IGNORE_TOKEN_ID] * len(res["input_ids"]) - elif tool_role_label and tool_role_label in role: - labels = [IGNORE_TOKEN_ID] * len(res["input_ids"]) - else: - LOG.warning(f"unhandled role: {role}") - continue # pylint: disable=duplicate-code result, current_len = parse_tokenized_to_result( diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index fa181f916d..2b6b4f8577 100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -259,6 +259,12 @@ def __repr__(self) -> str: "Role did not alternate between turns (gpt and human). Please check your data." ) +CONVERSATION_ROLE_FORMAT = { + "chatml": "<|im_start|>{ROLE}", + "zephyr": "<|{ROLE}|>", + "vicuna_v1.1": "{ROLE}", +} + class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods """ @@ -268,7 +274,9 @@ class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods role_key_human = "human" role_key_model = "gpt" # Optional, only used for tool usage datasets. - role_key_tool = None + role_key_tool: Optional[str] = None + # Optional, role input/output mapping + roles: Optional[dict] = None def __init__( self, @@ -277,6 +285,7 @@ def __init__( role_key_human: Optional[str] = None, role_key_model: Optional[str] = None, role_key_tool: Optional[str] = None, + roles: Optional[dict] = None, ): if conversation: if isinstance(conversation, Conversation): @@ -291,6 +300,8 @@ def __init__( self.role_key_model = role_key_model if role_key_tool: self.role_key_tool = role_key_tool + if roles: + self.roles = roles def _build_result(self, source): if len(source) < 2: @@ -322,11 +333,23 @@ def _build_result(self, source): conv.messages = [] for _, sentence in enumerate(source): - role = roles[sentence["from"]] - if len(conv.messages) > 0 and ( - (role == conv.messages[-1][0]) or (role not in conv.roles) - ): + from_role = sentence["from"] + if from_role in roles: + role = roles[from_role] + else: + if self._conversation.name not in CONVERSATION_ROLE_FORMAT: + raise NotImplementedError( + f"Role ({role}) not in default roles, and {self._conversation.name} does not support role remapping yet." + "Please help us by creating an Issue to add support for this conversation type." 
+ ) + + role = CONVERSATION_ROLE_FORMAT[self._conversation.name].format( + ROLE=from_role + ) + + if len(conv.messages) > 0 and ((role == conv.messages[-1][0])): LOG.warning(f"{SHAREGPT_ASSERTION_FAILED_ROLE}: {sentence}") + conv.append_message(role, sentence["value"]) return conv.get_turns() @@ -354,11 +377,13 @@ def __init__( conversation: Optional[Union[str, Conversation]] = None, role_key_human: Optional[str] = None, role_key_model: Optional[str] = None, + roles: Optional[dict] = None, ): super().__init__( conversation=conversation, role_key_human=role_key_human, role_key_model=role_key_model, + roles=roles, ) diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py index ef31c05c22..272debce30 100644 --- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py +++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py @@ -96,6 +96,8 @@ class SFTDataset(BaseModel): field_human: Optional[str] = None field_model: Optional[str] = None + roles: Optional[Dict[str, List[str]]] = None + class UserDefinedDPOType(BaseModel): """User defined typing for DPO""" @@ -311,6 +313,15 @@ class HyperparametersConfig(BaseModel): learning_rate: Union[str, float] weight_decay: Optional[float] = None optimizer: Optional[Union[OptimizerNames, Literal["lion_pytorch"]]] = None + optim_args: Optional[Union[str, Dict[str, Any]]] = Field( + default=None, metadata={"help": "Optional arguments to supply to optimizer."} + ) + optim_target_modules: Optional[Union[List[str], Literal["all_linear"]]] = Field( + default=None, + metadata={ + "help": "The target modules to optimize, i.e. the module names that you would like to train." + }, + ) torchdistx_path: Optional[str] = None lr_scheduler: Optional[SchedulerType] = None lr_scheduler_kwargs: Optional[Dict[str, Any]] = None @@ -416,6 +427,7 @@ class Config: datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None # type: ignore test_datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None # type: ignore + shuffle_merged_datasets: Optional[bool] = True dataset_prepared_path: Optional[str] = None dataset_shard_num: Optional[int] = None dataset_shard_idx: Optional[int] = None diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 2156d5997a..6c9bc68159 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -415,8 +415,11 @@ def for_d_in_datasets(dataset_configs): dataset = concatenate_datasets(datasets) if len(datasets) > 1: - LOG.info("shuffle merged datasets") - dataset = dataset.shuffle(seed=seed) + if cfg.shuffle_merged_datasets: + LOG.debug("shuffle merged datasets") + dataset = dataset.shuffle(seed=seed) + else: + LOG.debug("NOT shuffling merged datasets") dataset, _ = process_datasets_for_packing(cfg, dataset, None) @@ -819,7 +822,11 @@ def wrap_pretraining_dataset( else: encode = functools.partial(encode_pretraining, tokenizer, max_tokens) - dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size) + if cfg.shuffle_merged_datasets: + dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size) + else: + LOG.debug("NOT shuffling merged pretraining datasets") + dataset = dataset.map( encode, batched=True, diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 12c7ae9390..0a59eb2a4d 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -5,16 +5,14 @@ import math import os import types -from typing import Any, Dict, List, Optional, Tuple, Type, Union # noqa: 
F401 +from typing import Any, Dict, Optional, Tuple, Union # noqa: F401 import addict import bitsandbytes as bnb -import safetensors import torch import transformers from accelerate import init_empty_weights -from bitsandbytes.nn import Linear4bit, Params4bit -from fastcore.parallel import parallel +from bitsandbytes.nn import Params4bit from peft import ( LoftQConfig, PeftConfig, @@ -23,7 +21,7 @@ prepare_model_for_kbit_training, ) from peft.tuners.lora import QuantLinear -from torch import Tensor, nn +from torch import nn from transformers import ( # noqa: F401 AddedToken, AutoConfig, @@ -35,9 +33,7 @@ PreTrainedTokenizerBase, ) from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled -from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, hub -from axolotl.core.policies.auto_wrap import SUPPORTED_AUTO_WRAP_MODEL_TYPES from axolotl.models.mamba import fix_mamba_attn_for_loss from axolotl.monkeypatch.multipack import ( SUPPORTED_MULTIPACK_MODEL_TYPES, @@ -271,117 +267,6 @@ def load_tokenizer(cfg): return tokenizer -def replace_linear( - model: nn.Module, - linear_replacement: Type[nn.Module], - quant_config: Union[dict, None] = None, - skip_modules=None, - **kwargs, -): - """ - Replace linear modules with a new Linear module. - Parameters: - model (`torch.nn.Module`): - Input model or `torch.nn.Module` as the function is run recursively. - linear_replacement (`torch.nn.Module`): - The linear module that replaces the old one. Only expects standard arguments. - If other arguments need to be passed, use a lambda. - skip_modules (`List[str]`, *optional*, defaults to `lm_head`): - List of modules names not to convert. Defaults to `lm_head`. - """ - if skip_modules is None: - skip_modules = ["lm_head"] - for name, module in model.named_children(): - if len(list(module.children())) > 0: - replace_linear( - module, linear_replacement, quant_config, skip_modules, **kwargs - ) - - if isinstance(module, torch.nn.Linear) and name not in skip_modules: - if issubclass(linear_replacement, Linear4bit): - model._modules[ # pylint: disable=protected-access - name - ] = linear_replacement( - module.in_features, - module.out_features, - module.bias is not None, - **kwargs, - ) - else: - raise ValueError( - f"Unsupported linear replacement: {type(linear_replacement)}" - ) - return model - - -def load_and_quantize( - module: nn.Module, - name: str, - value: Tensor, - device: torch.device = None, - dtype: torch.dtype = None, - skip_names: Optional[List[str]] = None, - is_meta_rank: bool = False, - low_memory: bool = True, - verbose: bool = False, - quant_method: str = "bnb", -): - """ - Loads `value` tensor into submodule of `module`, optionally skipping `skip_names` and converting to `dtype`. - - Quantizes `Params4bit` on `device` then places on "cpu" if low_memory=True or "meta" if is_meta_rank=True. 
- """ - - if skip_names is None: - skip_names = [] - - def place_on_device(value): - if is_meta_rank: - device = "meta" - elif low_memory: - device = "cpu" - else: - device = "cuda" - return value.to(device=device, dtype=dtype) - - if any(skip_name in name for skip_name in skip_names): - if verbose: - print(f"Skipping {name} because it is in skip_names") - return - - module_key, _, value_key = name.rpartition(".") - try: - submodule = module.get_submodule(module_key) - except AttributeError as exc: - print(f"Module {module_key} not found:\n{exc}") - return - - try: - if quant_method == "bnb": - param = submodule.get_parameter(value_key) - if isinstance(param, Params4bit): - # With `sync_module_states=True`, a meta device Params4bit needs to be the same - # shape as the quantized Params4bit with an initialized quant_state. However, - # FSDP only syncs parameters and buffers, so the quant_state isn't copied. This - # workaround quantizes Params4bit to initialize quant_state on all ranks, then - # replaces Params4bit's data with a meta tensor to free memory on non-rank 0. - value = type(param)( - value.to(device=device, dtype=dtype).data, **param.__dict__ - ).cuda(device) - if is_meta_rank: - value = type(param)(value.data.to("meta"), **value.__dict__) - elif low_memory: - value = type(param)(value.data.to("cpu"), **value.__dict__) - else: - value = type(param)(place_on_device(value).data) - - except AttributeError: - # it's a buffer - value = place_on_device(value) - - setattr(submodule, value_key, value) - - def load_model( cfg: DictDefault, tokenizer: PreTrainedTokenizerBase, @@ -567,6 +452,7 @@ def load_model( "bnb_4bit_compute_dtype": cfg.torch_dtype, "bnb_4bit_use_double_quant": True, "bnb_4bit_quant_type": "nf4", + "bnb_4bit_quant_storage": torch.bfloat16, } if cfg.bnb_config_kwargs: @@ -616,78 +502,10 @@ def load_model( model_kwargs["attn_implementation"] = "eager" model_config._attn_implementation = "eager" # pylint: disable=protected-access - qlora_fsdp = ( - cfg.fsdp - and cfg.adapter == "qlora" - and model_config.model_type in SUPPORTED_AUTO_WRAP_MODEL_TYPES - ) + qlora_fsdp = cfg.fsdp and cfg.adapter == "qlora" try: - if qlora_fsdp: - if cfg.bf16 or cfg.bfloat16: - torch_dtype, compute_dtype = torch.float32, torch.bfloat16 - elif cfg.fp16 or cfg.float16: - torch_dtype, compute_dtype = torch.float32, torch.float16 - else: - torch_dtype, compute_dtype = torch.float32, torch.float16 - - with init_empty_weights(): - LOG.info("Loading model with empty weights.") - model = AutoModelForCausalLM.from_config(model_config) - model.model = replace_linear( - model.model, - Linear4bit, - compute_dtype=compute_dtype, - quant_type="nf4", - quant_storage=torch_dtype, - ) - - model.is_loaded_in_4bit = True - - # Grab the safetensors files that hold the weights - try: - idx = hub.cached_file(base_model, SAFE_WEIGHTS_INDEX_NAME) - files, _ = hub.get_checkpoint_shard_files(base_model, idx) - except OSError: - try: - # This means the model doesn't have a model.safetensors.index.json because it is not sharded - files = [] - files.append(hub.cached_file(base_model, SAFE_WEIGHTS_NAME)) - except OSError as exc: - # This means the model probably doesn't have a safetensors file - raise exc - - # Load in the weights, using our custom load_and_quantize method which quantizes Params4bit on the fly - # and then places each layer on CPU or meta if using low_memory to minimize GPU memory usage - def load_and_quantize_parallel(name_param, model, **kwargs): - name, param = name_param - load_and_quantize(model, 
name, param, **kwargs) - - param_count = sum((p.numel() for n, p in model.named_parameters())) - for filename in files: - weights = safetensors.torch.load_file(filename) - quant_method = "bnb" - devprops = torch.cuda.get_device_properties(torch.cuda.current_device()) - left = int(os.cpu_count() / torch.cuda.device_count()) - right = int( - 8 * (devprops.total_memory / 1e9 / 40) * (70 / (param_count / 1e9)) - ) - n_workers = min(left, right) - parallel( - load_and_quantize_parallel, - weights.items(), - n_workers=n_workers, - threadpool=True, - model=model, - dtype=torch_dtype, - device=cfg.local_rank, - skip_names=[], - is_meta_rank=(cfg.local_rank != 0), - verbose=False, - quant_method=quant_method, - ) - - elif ( + if ( model_config.model_type == "llama" and not cfg.trust_remote_code and not cfg.gptq @@ -714,32 +532,6 @@ def load_and_quantize_parallel(name_param, model, **kwargs): if cfg.flash_attn_fuse_qkv: LOG.info("patching with fused QKV") replace_llama_qkv_with_fused(model) - # elif model_type == "GPTNeoXForCausalLM" and cfg.flash_attention: - # This is a WIP, still an issue with the backward pass - # RuntimeError: grad can be implicitly created only for scalar outputs - # TODO: try config.sequence_parallel = False - # # https://github.com/HazyResearch/flash-attention/blob/40a25c8ee7465cf547b929cfa2937034e37bfce9/tests/models/test_gpt_neox.py#L12 - # # https://github.com/HazyResearch/flash-attention/tree/main/training#model-components - # # add `**kwargs` to https://github.com/HazyResearch/flash-attention/blob/40a25c8ee7465cf547b929cfa2937034e37bfce9/flash_attn/models/gpt.py#L442 - # from flash_attn.utils.pretrained import state_dict_from_pretrained - # from flash_attn.models.gpt import GPTLMHeadModel - # from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox, gpt_neox_config_to_gpt2_config - # from transformers import GPTNeoXConfig - # config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(base_model)) - # config.use_flash_attn = True - # config.fused_bias_fc = True - # config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast" - # config.activation_function = "gelu_fast" - # config.fused_dropout_add_ln = True - # # config.residual_in_fp32 = True - # - # model: GPTLMHeadModel = GPTLMHeadModel.from_pretrained( - # base_model, - # config, - # dtype=torch_dtype, - # device=cfg.device, - # ) - # model.train() # sets to train instead of eval mode elif model_type == "MambaLMHeadModel": # FIXME this is janky at best and hacked together to make it work MambaLMHeadModel = fix_mamba_attn_for_loss() # pylint: disable=invalid-name diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index e52f35ccca..380264a7ac 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -304,6 +304,10 @@ def setup_fsdp_envs(cfg): os.environ["FSDP_OFFLOAD_PARAMS"] = "true" if cfg.fsdp_config.fsdp_sync_module_states: os.environ["FSDP_SYNC_MODULE_STATES"] = "true" + if cfg.fsdp_config.fsdp_cpu_ram_efficient_loading: + os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "true" + if cfg.fsdp_config.fsdp_use_orig_params: + os.environ["FSDP_USE_ORIG_PARAMS"] = "true" if cfg.fsdp_config.fsdp_state_dict_type: os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap: diff --git a/styles.css b/styles.css new file mode 100644 index 0000000000..2ddf50c7b4 --- /dev/null +++ b/styles.css @@ -0,0 +1 @@ +/* css styles */ diff --git a/tests/e2e/test_mixtral.py b/tests/e2e/test_mixtral.py index 
ee6f06d875..68afcdec4a 100644 --- a/tests/e2e/test_mixtral.py +++ b/tests/e2e/test_mixtral.py @@ -77,7 +77,7 @@ def test_qlora_w_fa2(self, temp_dir): model, _ = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) assert ( model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype - == torch.uint8 + == torch.float32 ) assert (Path(temp_dir) / "adapter_model.bin").exists() @@ -131,7 +131,7 @@ def test_qlora_wo_fa2(self, temp_dir): model, _ = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta) assert ( model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype - == torch.uint8 + == torch.float32 ) assert (Path(temp_dir) / "adapter_model.bin").exists() diff --git a/tests/prompt_strategies/test_sharegpt.py b/tests/prompt_strategies/test_sharegpt.py index c9290b220a..19d63eac83 100644 --- a/tests/prompt_strategies/test_sharegpt.py +++ b/tests/prompt_strategies/test_sharegpt.py @@ -62,6 +62,38 @@ def fixture_sharegpt_glaive_dataset(): ) +@pytest.fixture(name="multi_role_dataset") +def fixture_multi_role_dataset(): + return Dataset.from_list( + [ + { + "conversations": [ + { + "from": "system", + "value": "use get_weather(city) to get the weather for a city", + }, + { + "from": "human", + "value": "hello, what's the weather in New York?", + }, + { + "from": "gpt", + "value": "let me get that for you", + }, + { + "from": "tool", + "value": "get_weather(New York)", + }, + { + "from": "gpt", + "value": "the weather in New York is 70 degrees and sunny", + }, + ] + } + ] + ) + + @pytest.fixture(name="tokenizer") def fixture_tokenizer(): tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") @@ -196,3 +228,39 @@ def test_chatml_glaive(self, glaive_dataset, tokenizer): 32001, 13892, 13, 28737, 28742, 28719, 7371, 28725, 562, 315, 949, 28742, 28707, 506, 272, 21368, 298, 1820, 22447, 28723, 28705, 523, 28766, 416, 1009, 772, 28766, 28767, 32000, 28705, 13 # gpt ] # fmt: on + + def test_multi_role_dataset(self, multi_role_dataset, tokenizer): + strategy = SimpleShareGPTPromptTokenizingStrategy( + ShareGPTPrompterV2(conversation="chatml", roles={"input": ["tool"]}), + tokenizer, + False, # train_on_inputs + 2048, # sequence_len + ) + + dataset_wrapper = TokenizedPromptDataset( + strategy, multi_role_dataset, process_count=1 + ) + + input_ids = dataset_wrapper[0]["input_ids"] + # fmt: off + assert input_ids == [ + 1, # bos + 32001, 1587, 13, 1730, 625, 28730, 769, 1223, 28732, 18373, 28731, 298, 625, 272, 8086, 354, 264, 2990, 32000, 28705, 13, # system + 32001, 2188, 13, 21558, 28725, 767, 28742, 28713, 272, 8086, 297, 1450, 2726, 28804, 32000, 28705, 13, # human + 32001, 13892, 13, 895, 528, 625, 369, 354, 368, 32000, 28705, 13, # gpt + 32001, 3921, 13, 527, 28730, 769, 1223, 28732, 2972, 2726, 28731, 32000, 28705, 13, # tool + 32001, 13892, 13, 1237, 8086, 297, 1450, 2726, 349, 28705, 28787, 28734, 11182, 304, 4376, 1780, 32000, 28705, 13 # gpt + ] + # fmt: on + + labels = dataset_wrapper[0]["labels"] + # fmt: off + assert labels == [ + -100, # bos + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # system + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # human + -100, -100, 13, 895, 528, 625, 369, 354, 368, 32000, 28705, 13, # gpt + -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # tool + -100, -100, 13, 1237, 8086, 297, 1450, 2726, 349, 28705, 28787, 28734, 11182, 304, 
4376, 1780, 32000, 28705, 13 # gpt + ] + # fmt: on
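
A few usage notes on the configuration surface introduced by this changeset follow, with illustrative sketches rather than tested configs.

The `optim_args` and `optim_target_modules` options are forwarded to the training arguments (a dict `optim_args` is flattened to a `key=value,...` string by the builder) and exist mainly to support the GaLore optimizers. A minimal sketch; the rank, gap and scale values are illustrative, not recommendations:

```yaml
optimizer: galore_adamw
optim_args:             # flattened to "rank=128,update_proj_gap=200,scale=0.25,proj_type=std"
  rank: 128
  update_proj_gap: 200
  scale: 0.25
  proj_type: std
optim_target_modules:   # module names GaLore is applied to, e.g. for a llama-style model
  - self_attn
  - mlp
```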
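The new `roles` mapping on `sharegpt` datasets routes speaker keys other than `human`/`gpt`/`system` to the input or output side of the loss mask; roles that are not in the default map fall back to the conversation's role format (chatml, zephyr and vicuna_v1.1 are covered by `CONVERSATION_ROLE_FORMAT`). A sketch matching the tool-calling fixture in the test above; the dataset path is hypothetical:

```yaml
datasets:
  - path: your-org/tool-calls-sharegpt   # hypothetical path
    type: sharegpt
    conversation: chatml
    roles:
      input: ["tool"]   # labels for these turns are masked unless train_on_inputs is set
      # output: [...]   # optional extra assistant-style role keys
```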
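`shuffle_merged_datasets` (default `true`) controls whether the concatenated `datasets` are shuffled after merging, and the same flag gates shuffling of a streamed `pretraining_dataset`. Setting it to `false` keeps samples in the listed order, for example for staged data mixes (paths below are hypothetical):

```yaml
datasets:
  - path: your-org/stage-1-data   # hypothetical
    type: sharegpt
  - path: your-org/stage-2-data   # hypothetical
    type: sharegpt
shuffle_merged_datasets: false    # keep the original dataset order
```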
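For FSDP, `setup_fsdp_envs` now also exports `FSDP_CPU_RAM_EFFICIENT_LOADING` and `FSDP_USE_ORIG_PARAMS` from `fsdp_config`, the trainer flips `limit_all_gathers` on the accelerator's FSDP plugin when it is enabled in `fsdp_config` instead of constructing its own plugin, and the QLoRA + FSDP path drops the custom quantized-loading code in favour of the standard loading path (the default BitsAndBytes config now sets `bnb_4bit_quant_storage` to `bfloat16`). A rough FSDP + QLoRA sketch using these keys; the wrapped layer class is model-dependent and the overall shape is an assumption, not a verified config:

```yaml
adapter: qlora
load_in_4bit: true
bf16: true

fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: false
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer   # model-dependent
```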