diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh
new file mode 100644
index 000000000..2e5adcd6a
--- /dev/null
+++ b/.github/scripts/build.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+python_executable=python$1
+cuda_home=/usr/local/cuda-$2
+
+# Update paths
+PATH=${cuda_home}/bin:$PATH
+LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
+
+# Install requirements
+$python_executable -m pip install wheel packaging
+
+# Limit the number of parallel jobs to avoid OOM
+export MAX_JOBS=2
+# Make sure release wheels are built for the following architectures
+export TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0+PTX"
+# Build
+$python_executable setup.py bdist_wheel --dist-dir=dist
diff --git a/.github/scripts/create_release.js b/.github/scripts/create_release.js
new file mode 100644
index 000000000..475742118
--- /dev/null
+++ b/.github/scripts/create_release.js
@@ -0,0 +1,20 @@
+// Uses GitHub's API to create the release and wait for the result.
+// We use a JS script since the GitHub CLI doesn't provide a way to wait for the release's creation and returns immediately.
+
+module.exports = async (github, context, core) => {
+    try {
+        const response = await github.rest.repos.createRelease({
+            draft: false,
+            generate_release_notes: true,
+            name: process.env.RELEASE_TAG,
+            owner: context.repo.owner,
+            prerelease: true,
+            repo: context.repo.repo,
+            tag_name: process.env.RELEASE_TAG,
+        });
+
+        core.setOutput('upload_url', response.data.upload_url);
+    } catch (error) {
+        core.setFailed(error.message);
+    }
+}
\ No newline at end of file
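For context, the `createRelease` call in create_release.js maps onto GitHub's REST endpoint `POST /repos/{owner}/{repo}/releases`. A rough curl equivalent is sketched below; the repo slug and token are placeholders, and `RELEASE_TAG` is the same environment variable the workflow passes in.

```bash
# Rough curl equivalent of the createRelease call in create_release.js (sketch only).
# OWNER/REPO and GITHUB_TOKEN are placeholders; RELEASE_TAG mirrors the workflow env var.
curl -s -X POST \
  -H "Authorization: Bearer ${GITHUB_TOKEN}" \
  -H "Accept: application/vnd.github+json" \
  https://api.github.com/repos/OWNER/REPO/releases \
  -d "{\"tag_name\": \"${RELEASE_TAG}\", \"name\": \"${RELEASE_TAG}\", \"draft\": false, \"prerelease\": true, \"generate_release_notes\": true}"
```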
diff --git a/.github/scripts/cuda-install.sh b/.github/scripts/cuda-install.sh
new file mode 100644
index 000000000..312c6e82f
--- /dev/null
+++ b/.github/scripts/cuda-install.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Replace '.' with '-' ex: 11.8 -> 11-8
+cuda_version=$(echo $1 | tr "." "-")
+# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
+OS=$(echo $2 | tr -d ".\-")
+
+# Installs CUDA
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+rm cuda-keyring_1.1-1_all.deb
+sudo apt -qq update
+sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
+sudo apt clean
+
+# Test nvcc
+PATH=/usr/local/cuda-$1/bin:${PATH}
+nvcc --version
+
+# Log gcc, g++, c++ versions
+gcc --version
+g++ --version
+c++ --version
diff --git a/.github/scripts/env.sh b/.github/scripts/env.sh
new file mode 100644
index 000000000..d7baaecbb
--- /dev/null
+++ b/.github/scripts/env.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# This file installs common Linux environment tools
+
+export LANG=C.UTF-8
+
+# python_version=$1
+
+sudo apt-get update && \
+sudo apt-get install -y --no-install-recommends \
+    software-properties-common
+
+sudo apt-get install -y --no-install-recommends \
+    build-essential \
+    apt-utils \
+    ca-certificates \
+    wget \
+    git \
+    vim \
+    libssl-dev \
+    curl \
+    unzip \
+    unrar \
+    cmake \
+    net-tools \
+    sudo \
+    autotools-dev \
+    rsync \
+    jq \
+    openssh-server \
+    tmux \
+    screen \
+    htop \
+    pdsh \
+    openssh-client \
+    lshw \
+    dmidecode \
+    util-linux \
+    automake \
+    autoconf \
+    libtool \
+    net-tools \
+    pciutils \
+    libpci-dev \
+    libaio-dev \
+    libcap2 \
+    libtinfo5 \
+    fakeroot \
+    devscripts \
+    debhelper \
+    nfs-common
+
+# Remove GitHub bloat files to free up disk space
+sudo rm -rf "/usr/local/share/boost"
+sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+sudo rm -rf "/usr/share/dotnet"
diff --git a/.github/scripts/pytorch-install.sh b/.github/scripts/pytorch-install.sh
new file mode 100644
index 000000000..dfc1851d7
--- /dev/null
+++ b/.github/scripts/pytorch-install.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+python_executable=python$1
+pytorch_version=$2
+cuda_version=$3
+
+# Install torch
+$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
+$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
+
+# Print version information
+$python_executable --version
+$python_executable -c "import torch; print('PyTorch:', torch.__version__)"
+$python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
+$python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
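The workflow change below wires these four scripts together. For a single matrix entry, the wheel job is roughly equivalent to the following sequence (a sketch assuming Python 3.10, CUDA 11.8, and PyTorch 2.3.0 on an ubuntu-20.04 host; the real workflow fans this out across the full matrix):

```bash
# Approximate replay of one wheel job from the workflow below.
# The version arguments are one example matrix entry, not the full build matrix.
bash -x .github/scripts/env.sh                              # system packages, then free up runner disk space
bash -x .github/scripts/cuda-install.sh 11.8 ubuntu-20.04   # CUDA toolkit, nvcc, and dev libraries
bash -x .github/scripts/pytorch-install.sh 3.10 2.3.0 11.8  # torch wheel matching the CUDA version
bash -x .github/scripts/build.sh 3.10 11.8                  # build the release wheel into ./dist
```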
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index c3a5c659d..55e1f2f30 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -1,206 +1,99 @@
-# This workflow will:
-# - Create a new Github release
-# - Build wheels for supported architectures
-# - Deploy the wheels to the Github release
-# - Release the static code to PyPi
-# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+# This workflow will upload a Python package as a release asset
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions

-name: Build wheels and deploy
+name: Create Release

 on:
-  create:
+  push:
     tags:
       - v*

-jobs:
+# Needed to create release and upload assets
+permissions:
+  contents: write

-  setup_release:
+jobs:
+  release:
+    # Retrieve tag and create release
     name: Create Release
     runs-on: ubuntu-latest
+    outputs:
+      upload_url: ${{ steps.create_release.outputs.upload_url }}
     steps:
-      - name: Get the tag version
-        id: extract_branch
-        run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Extract branch info
         shell: bash
+        run: |
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV

       - name: Create Release
         id: create_release
-        uses: actions/create-release@v1
+        uses: "actions/github-script@v6"
         env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          RELEASE_TAG: ${{ env.release_tag }}
         with:
-          tag_name: ${{ steps.extract_branch.outputs.branch }}
-          release_name: ${{ steps.extract_branch.outputs.branch }}
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          script: |
+            const script = require('.github/scripts/create_release.js')
+            await script(github, context, core)

-  build_wheels:
+  wheel:
     name: Build Wheel
-    needs: setup_release
     runs-on: ${{ matrix.os }}
+    needs: release

     strategy:
       fail-fast: false
       matrix:
-        # Using ubuntu-20.04 instead of 22.04 for more compatibility (glibc). Ideally we'd use the
-        # manylinux docker image, but I haven't figured out how to install CUDA on manylinux.
-        os: [ubuntu-20.04]
+        os: ['ubuntu-20.04']
         python-version: ['3.8', '3.9', '3.10', '3.11']
-        torch-version: ['2.3.0']
-        cuda-version: ['11.8.0', '12.1.1']
-        # We need separate wheels that either uses C++11 ABI (-D_GLIBCXX_USE_CXX11_ABI) or not.
-        # Pytorch wheels currently don't use it, but nvcr images have Pytorch compiled with C++11 ABI.
-        # Without this we get import error (undefined symbol: _ZN3c105ErrorC2ENS_14SourceLocationESs)
-        # when building without C++11 ABI and using it on nvcr images.
-        cxx11_abi: ['FALSE', 'TRUE']
+        pytorch-version: ['2.3.0']  # Must be the most recent version that meets requirements-cuda.txt.
+        cuda-version: ['11.8', '12.1']

     steps:
       - name: Checkout
         uses: actions/checkout@v3

-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
+      - name: Setup ccache
+        uses: hendrikmuhs/ccache-action@v1.2

-      - name: Set CUDA and PyTorch versions
-        run: |
-          echo "MATRIX_CUDA_VERSION=$(echo ${{ matrix.cuda-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
-          echo "MATRIX_TORCH_VERSION=$(echo ${{ matrix.torch-version }} | awk -F \. {'print $1 "." $2'})" >> $GITHUB_ENV
-          echo "MATRIX_PYTHON_VERSION=$(echo ${{ matrix.python-version }} | awk -F \. {'print $1 $2'})" >> $GITHUB_ENV
-
-      - name: Free up disk space
+      - name: Set up Linux Env
         if: ${{ runner.os == 'Linux' }}
-        # https://github.com/easimon/maximize-build-space/blob/master/action.yml
-        # https://github.com/easimon/maximize-build-space/tree/test-report
         run: |
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /opt/ghc
-          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          bash -x .github/scripts/env.sh

-      - name: Set up swap space
-        if: runner.os == 'Linux'
-        uses: pierotofy/set-swap-space@v1.0
+      - name: Set up Python
+        uses: actions/setup-python@v4
         with:
-          swap-size-gb: 10
+          python-version: ${{ matrix.python-version }}

       - name: Install CUDA ${{ matrix.cuda-version }}
-        if: ${{ matrix.cuda-version != 'cpu' }}
-        uses: Jimver/cuda-toolkit@v0.2.14
-        id: cuda-toolkit
-        with:
-          cuda: ${{ matrix.cuda-version }}
-          linux-local-args: '["--toolkit"]'
-          # default method is "local", and we're hitting some error with caching for CUDA 11.8 and 12.1
-          # method: ${{ (matrix.cuda-version == '11.8.0' || matrix.cuda-version == '12.1.0') && 'network' || 'local' }}
-          method: 'network'
-          # We need the cuda libraries (e.g. cuSparse, cuSolver) for compiling PyTorch extensions,
-          # not just nvcc
-          # sub-packages: '["nvcc"]'
-
-      - name: Install PyTorch ${{ matrix.torch-version }}+cu${{ matrix.cuda-version }}
         run: |
-          pip install --upgrade pip
-          # If we don't install before installing Pytorch, we get error for torch 2.0.1
-          # ERROR: Could not find a version that satisfies the requirement setuptools>=40.8.0 (from versions: none)
-          pip install lit
-          # We want to figure out the CUDA version to download pytorch
-          # e.g. we can have system CUDA version being 11.7 but if torch==1.12 then we need to download the wheel from cu116
-          # see https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix
-          # This code is ugly, maybe there's a better way to do this.
-          export TORCH_CUDA_VERSION=$(python -c "from os import environ as env; \
-            minv = {'1.12': 113, '1.13': 116, '2.0': 117, '2.1': 118, '2.2': 118, '2.3': 118}[env['MATRIX_TORCH_VERSION']]; \
-            maxv = {'1.12': 116, '1.13': 117, '2.0': 118, '2.1': 121, '2.2': 121, '2.3': 121}[env['MATRIX_TORCH_VERSION']]; \
-            print(max(min(int(env['MATRIX_CUDA_VERSION']), maxv), minv))" \
-          )
-          if [[ ${{ matrix.torch-version }} == *"dev"* ]]; then
-            if [[ ${MATRIX_TORCH_VERSION} == "2.2" ]]; then
-              # --no-deps because we can't install old versions of pytorch-triton
-              pip install typing-extensions jinja2
-              pip install --no-cache-dir --no-deps --pre https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}/torch-${{ matrix.torch-version }}%2Bcu${TORCH_CUDA_VERSION}-cp${MATRIX_PYTHON_VERSION}-cp${MATRIX_PYTHON_VERSION}-linux_x86_64.whl
-            else
-              pip install --no-cache-dir --pre torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/nightly/cu${TORCH_CUDA_VERSION}
-            fi
-          else
-            pip install --no-cache-dir torch==${{ matrix.torch-version }} --index-url https://download.pytorch.org/whl/cu${TORCH_CUDA_VERSION}
-          fi
-          nvcc --version
-          python --version
-          python -c "import torch; print('PyTorch:', torch.__version__)"
-          python -c "import torch; print('CUDA:', torch.version.cuda)"
-          python -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
-        shell:
-          bash
-
-      - name: Build wheel
-        run: |
-          # We want setuptools >= 49.6.0 otherwise we can't compile the extension if system CUDA version is 11.7 and pytorch cuda version is 11.6
-          # https://github.com/pytorch/pytorch/blob/664058fa83f1d8eede5d66418abff6e20bd76ca8/torch/utils/cpp_extension.py#L810
-          # However this still fails so I'm using a newer version of setuptools
-          pip install setuptools==68.0.0
-          pip install ninja packaging wheel
-          export PATH=/usr/local/nvidia/bin:/usr/local/nvidia/lib64:$PATH
-          export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-          # Limit MAX_JOBS otherwise the github runner goes OOM
-          MAX_JOBS=2 FLASH_ATTENTION_FORCE_BUILD="TRUE" FLASH_ATTENTION_FORCE_CXX11_ABI=${{ matrix.cxx11_abi}} python setup.py bdist_wheel --dist-dir=dist
-          tmpname=cu${MATRIX_CUDA_VERSION}torch${MATRIX_TORCH_VERSION}cxx11abi${{ matrix.cxx11_abi }}
-          wheel_name=$(ls dist/*whl | xargs -n 1 basename | sed "s/-/+$tmpname-/2")
-          ls dist/*whl |xargs -I {} mv {} dist/${wheel_name}
-          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+          bash -x .github/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}

-      - name: Log Built Wheels
+      - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
         run: |
-          ls dist
+          bash -x .github/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}

-      - name: Get the tag version
-        id: extract_branch
-        run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
-
-      - name: Get Release with tag
-        id: get_current_release
-        uses: joutvhu/get-release@v1
-        with:
-          tag_name: ${{ steps.extract_branch.outputs.branch }}
+      - name: Build wheel
+        shell: bash
         env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          CMAKE_BUILD_TYPE: Release # do not compile with debug symbols to reduce wheel size
+        run: |
+          bash -x .github/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
+          asset_name=${wheel_name//"linux"/"manylinux1"}
+          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+          echo "asset_name=${asset_name}" >> $GITHUB_ENV

       - name: Upload Release Asset
-        id: upload_release_asset
         uses: actions/upload-release-asset@v1
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         with:
-          upload_url: ${{ steps.get_current_release.outputs.upload_url }}
-          asset_path: ./dist/${{env.wheel_name}}
-          asset_name: ${{env.wheel_name}}
+          upload_url: ${{ needs.release.outputs.upload_url }}
+          asset_path: ./dist/${{ env.wheel_name }}
+          asset_name: ${{ env.asset_name }}
           asset_content_type: application/*
-
-  publish_package:
-    name: Publish package
-    needs: [build_wheels]
-
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v3
-
-      - uses: actions/setup-python@v4
-        with:
-          python-version: '3.10'
-
-      - name: Install dependencies
-        run: |
-          pip install ninja packaging setuptools wheel twine
-          # We don't want to download anything CUDA-related here
-          pip install torch --index-url https://download.pytorch.org/whl/cpu
-
-      - name: Build core package
-        env:
-          FLASH_ATTENTION_SKIP_CUDA_BUILD: "TRUE"
-        run: |
-          python setup.py sdist --dist-dir=dist
-
-      - name: Deploy
-        env:
-          TWINE_USERNAME: "__token__"
-          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
-        run: |
-          python -m twine upload dist/*
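With the trigger changed from tag creation events to pushed `v*` tags, cutting a release comes down to pushing a version tag (the tag name below is illustrative):

```bash
# Illustrative: pushing a v* tag starts the Create Release workflow,
# which creates the GitHub release and uploads the built wheels as assets.
git tag v0.1.0
git push origin v0.1.0
```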