By default, Tabby operates in int8 mode with CUDA, requiring approximately 8GB of VRAM for CodeLlama-7B.
For ROCm, the actual limits are currently largely untested, but the same CodeLlama-7B appears to use about 8GB of VRAM on an AMD Radeon™ RX 7900 XTX, as reported by the ROCm monitoring tools.
Tabby only supports the use of a single GPU. To utilize multiple GPUs, you can launch multiple Tabby instances and set CUDA_VISIBLE_DEVICES (for CUDA) or HIP_VISIBLE_DEVICES (for ROCm) accordingly, as in the sketch below.
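A minimal sketch of running one instance per GPU. The model name, ports, and flags here are illustrative assumptions; check `tabby serve --help` for the options available in your build.

```bash
# Pin each Tabby instance to a single GPU and give each its own port.
CUDA_VISIBLE_DEVICES=0 tabby serve --device cuda --model TabbyML/CodeLlama-7B --port 8080 &
CUDA_VISIBLE_DEVICES=1 tabby serve --device cuda --model TabbyML/CodeLlama-7B --port 8081 &
```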
If your GPU is not officially supported by ROCm but a similar GPU is, you can set the HSA_OVERRIDE_GFX_VERSION variable to that GPU's version. For example, set it to 10.3.0 for RDNA2 cards and to 11.0.0 for RDNA3 cards.
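A sketch of combining the override with device selection on ROCm. The flags and model name are assumptions for illustration; adapt them to your installation.

```bash
# Run Tabby on the second ROCm GPU, overriding the GFX version for an RDNA3 card.
HIP_VISIBLE_DEVICES=1 HSA_OVERRIDE_GFX_VERSION=11.0.0 tabby serve --device rocm --model TabbyML/CodeLlama-7B
```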