Pull upstream changes, fix conflict, bump version to 0.0.3
0cc4m committed Jun 12, 2023
2 parents 724c447 + 896da5d commit 82fa31f
Showing 36 changed files with 730 additions and 842 deletions.
2 changes: 2 additions & 0 deletions .dockerignore
@@ -0,0 +1,2 @@
exllama_sessions
models
4 changes: 4 additions & 0 deletions .env
@@ -0,0 +1,4 @@
PORT=5000
MODEL_PATH=models/LLaMA-7B-4bit-128g # replace with the actual model path on the host
CONTAINER_MODEL_PATH=/app/model
SESSIONS_PATH=./exllama_sessions
24 changes: 24 additions & 0 deletions Dockerfile
@@ -0,0 +1,24 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 as build

ENV RUN_UID=1000

RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y ninja-build python3 python3-pip && \
rm -rf /var/lib/apt/lists/*

# Setup user which will run the service
RUN useradd -m -u $RUN_UID user
USER user

COPY --chown=user . /app

WORKDIR /app

RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt \
&& pip install flask==2.3.2

USER root

STOPSIGNAL SIGINT
ENTRYPOINT ["/bin/bash", "-c", "/app/entrypoint.sh $0 $@"]
84 changes: 62 additions & 22 deletions README.md
@@ -1,34 +1,26 @@
# ExLlama

A rewrite of the HF transformers implementation of Llama with the following goals, among others:
A standalone Python/C++/CUDA implementation of Llama for use with 4-bit GPTQ weights, designed to be fast and
memory-efficient on modern GPUs.

* Designed for use with quantized weights
* Fast and memory-efficient inference (not just attention)
* Mapping across multiple devices
* Built-in (multi) LoRA support
* Companion library of funky sampling functions
Disclaimer: The project is coming along, but it's still a work in progress!

Disclaimer: This is currently a preview of a work in progress. Or maybe a proof of concept. Either way any part of it
is subject to change.
## Hardware requirements

## Hardware/software requirements

I am developing on an RTX 4090 and an RTX 3090-Ti. Both cards support the CUDA kernel, but there might be
incompatibilities with older cards. I have no way of testing that right now.
I am developing on an RTX 4090 and an RTX 3090-Ti. Both cards support the CUDA kernels, but there might be
incompatibilities with older cards.

## Dependencies

This list might be incomplete:

* `torch` tested on 2.1.0 (nightly) with cu118
* `torch` tested on 2.0.1 and 2.1.0 (nightly) with cu118
* `safetensors` 0.3.1
* `sentencepiece`
* `ninja`
* `flask` (only for the web UI)

## Linux/WSL prerequisites

pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118

## Windows prerequisites

@@ -69,7 +61,7 @@ I made a simple web UI for it. Like the rest of the project, it's a work in prog
it was mostly written by ChatGPT and it will haunt your dreams. But it sort of works, and it's kinda fun, especially
multibot mode:

![_screenshot.jpg](_screenshot.jpg)
![_screenshot.jpg](doc/_screenshot.jpg)

To run it:

@@ -79,13 +71,55 @@

Note that sessions are stored in `~/exllama_sessions/`.

## Docker
For security benefits and easier deployment, it is also possible to run the web UI in an isolated docker container. Note: the docker image currently only supports NVIDIA GPUs.

### Requirements
- [Docker](https://docs.docker.com/engine/install/)
- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)

It is recommended to run docker in [rootless mode](https://docs.docker.com/engine/security/rootless/).

### Build

The easiest way to build the docker image is using docker compose. First, set the `MODEL_PATH` and `SESSIONS_PATH` variables in the `.env` file to the actual directories on the host. Then run:

```
docker compose build
```
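
For reference, a filled-in `.env` might look like the sketch below; the host-side paths are placeholders and should point at the actual model and session directories on the host:

```
PORT=5000
MODEL_PATH=/home/user/models/LLaMA-7B-4bit-128g
CONTAINER_MODEL_PATH=/app/model
SESSIONS_PATH=./exllama_sessions
```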

It is also possible to manually build the image:

```
docker build -t exllama-web .
```

### Run

Using docker compose:

```
docker compose up
```

The web UI can now be accessed on the host at http://localhost:5000.
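
As a quick sanity check, the endpoint can be probed from the host (a sketch, assuming the default port from `.env`):

```
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:5000/
```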

The configuration can be viewed in `docker-compose.yml` and changed by creating a `docker-compose.override.yml` file.
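
For example, a minimal `docker-compose.override.yml` that adds a restart policy to the `web` service might look like this sketch (the service name is taken from `docker-compose.yml`):

```
version: "3.9"
services:
  web:
    restart: unless-stopped
```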

Run manually:

```
docker run --gpus all -p 5000:5000 -v <path_to_model_files>:/app/model/ --rm -it exllama-web --host 0.0.0.0:5000
```
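
Note that `entrypoint.sh` expects `CONTAINER_MODEL_PATH` to be set, and the compose file also persists sessions to the host, so a manual invocation that mirrors the compose setup might look roughly like this (a sketch; the paths are taken from `docker-compose.yml` and `entrypoint.sh`):

```
docker run --gpus all -p 5000:5000 \
  -e CONTAINER_MODEL_PATH=/app/model \
  -v <path_to_model_files>:/app/model/ \
  -v <path_to_sessions>:/home/user/exllama_sessions \
  --rm -it exllama-web --host 0.0.0.0:5000
```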


## Results so far

### New implementation
| Model | Size | grpsz | act | Seq. len. | VRAM | Prompt | Best | Worst | Ppl |
|----------|------|-------|-----------------|----------------------|-----------|------------|---------|---------|------|
| Llama | 7B | 128 | no | 2,048 t | 5,194 MB | 13,918 t/s | 168 t/s | 139 t/s | 6.45 |
| Llama | 13B | 128 | no | 2,048 t | 9,127 MB | 7,507 t/s | 99 t/s | 84 t/s | 5.60 |
| Llama | 7B | 128 | no | 2,048 t | 5,194 MB | 13,918 t/s | 173 t/s | 140 t/s | 6.45 |
| Llama | 13B | 128 | no | 2,048 t | 9,127 MB | 7,507 t/s | 102 t/s | 86 t/s | 5.60 |
| Llama | 30B | 128 | no | 2,048 t | 20,795 MB | 2,959 t/s | 47 t/s | 40 t/s | 4.60 |
| Llama | 30B | 128 | yes | 2,048 t | 20,795 MB | 2,784 t/s | 45 t/s | 37 t/s | 4.55 |
| Llama | 30B | 32 | yes | 1,550 t <sup>1</sup> | 21,486 MB | 2,636 t/s | 41 t/s | 37 t/s | 4.52 |
Expand Down Expand Up @@ -136,11 +170,11 @@ speeds are no longer current.

## Todo

Moved the todo list [here](TODO.md).
Moved the todo list [here](doc/TODO.md).

## Compatibility

I downloaded a whole bunch of GPTQ models to test compatibility. [Here](model_compatibility.md) is the list of models
I downloaded a whole bunch of GPTQ models to test compatibility. [Here](doc/model_compatibility.md) is the list of models
confirmed to be working right now.

## Recent updates
Expand Down Expand Up @@ -171,4 +205,10 @@ on Windows.

**2023-06-09**: Fused most of the self-attention step. More to come. Slight speedup already, but more importantly went
from 69% actual CPU utilization to 37%. This should do a lot to address the bottleneck on CPUs with lower
single-threaded performance.
single-threaded performance.

**2023-06-10**: Docker support now! And some minor optimizations. Cleaned up the project a bit.

**2023-06-11**: Added some concurrency in a couple of places. It's only beneficial on the 4090, on small models where the
cores are somewhat underutilized and the L2 cache can keep up. For the 3090 it's detrimental to performance, so it's
disabled by default. YMMV. Use `-cs` to try it out.
12 changes: 7 additions & 5 deletions TODO.md → doc/TODO.md
Expand Up @@ -9,15 +9,16 @@

## GPU compatibility (etc.)

- [ ] Support for ROCm/AMD GPUs
- [x] Support for ROCm/AMD GPUs
- [ ] Optimize more for ROCm
- [ ] Test that CUDA code works on GTX 10-series and RTX 20-series at some point
- [x] Test performance on P40 (would be a good GPU to support)
- [ ] Improve performance on P40
- [x] Tunable kernel parameters
- [ ] More tunable kernel parameters
- [x] Test on Windows
- [ ] Easier extension loading on Windows
- [ ] Setup instructions for Windows
- [x] Easier extension loading on Windows
- [x] Setup instructions for Windows

## Testing

Expand All @@ -34,7 +35,7 @@

- [x] Support for de-quantizing select matrices at load time
- [x] ~~Better vector-matrix multiplication for de-quantized matrices~~ (dequant was a dead end)
- [ ] Fused QKV projection
- [x] Fused QKV projection
- [x] Fused MLP
- [x] Fused RoPE
- [x] ~~Build attention mask in CUDA rather than PyTorch~~
Expand All @@ -45,6 +46,7 @@
- [x] Examine if scaled_dot_product_attention is actually the best attention method for single tokens (it's not)
- [ ] Implement attention in CUDA
- [x] Rewrite at least the quantized matmul kernel. Should be a bunch of special cases to consider
- [x] Experiment with concurrent streams where possible (fused MLP and QKV proj.)

## Generation

Expand All @@ -53,7 +55,7 @@
- [ ] Multi-token censoring/de-censoring
- [ ] Multi-token repetition penalties
- [ ] (Multi) LoRA support
- [ ] Guided generation (chat with multiple bots at once, etc.)
- [x] Guided generation (chat with multiple bots at once, etc.)
- [ ] Multiple chat modes with prompt templates (instruct, etc.)

## Interface
File renamed without changes
1 change: 1 addition & 0 deletions model_compatibility.md → doc/model_compatibility.md
Expand Up @@ -19,6 +19,7 @@ As of **2023-05-24**, the following GPTQ models on HuggingFace all appear to be
- TheBloke/Manticore-13B-GPTQ
- TheBloke/medalpaca-13B-GPTQ-4bit
- TheBloke/medalpaca-13B-GPTQ-4bit (compat version)
- TheBloke/tulu-30B-GPTQ
- TheBloke/vicuna-13B-1.1-GPTQ-4bit-128g
- TheBloke/VicUnlocked-30B-LoRA-GPTQ
- TheBloke/wizard-mega-13B-GPTQ
28 changes: 28 additions & 0 deletions docker-compose.yml
@@ -0,0 +1,28 @@
version: "3.9"
name: exllama
services:
web:
build:
context: .
command: |
--host 0.0.0.0:$PORT
env_file:
- .env
environment:
- CONTAINER_MODEL_PATH=$CONTAINER_MODEL_PATH
volumes:
- $MODEL_PATH:$CONTAINER_MODEL_PATH
- $SESSIONS_PATH:/home/user/exllama_sessions
ports:
- "$PORT:$PORT"
tmpfs:
- /tmp
stdin_open: true
tty: true
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
15 changes: 15 additions & 0 deletions entrypoint.sh
@@ -0,0 +1,15 @@
#!/usr/bin/env bash
set -Eeuo pipefail

# Ensure that the model path is set
if [ -z "${CONTAINER_MODEL_PATH:-}" ]; then
echo "Must specify model path"
exit 1
fi

# Ensure that bind-mounted directories are owned by the user that runs the service
chown -R $RUN_UID:$RUN_UID $CONTAINER_MODEL_PATH
chown -R $RUN_UID:$RUN_UID /home/user/exllama_sessions

# Run service as specified (non-root) user
exec runuser -u $(id -un $RUN_UID) -- python3 /app/webui/app.py -d "$CONTAINER_MODEL_PATH" "$@"
2 changes: 1 addition & 1 deletion exllama/cuda_ext.py
Expand Up @@ -11,7 +11,7 @@
from exllama_ext import q4_matmul
from exllama_ext import half_matmul
from exllama_ext import half_matmul_cublas
from exllama_ext import q4_mlp
# from exllama_ext import q4_mlp
from exllama_ext import rms_norm
from exllama_ext import rope_
from exllama_ext import rep_penalty