# .github/workflows/python-package.yml

name: Python package

# Triggers:
#   * every push to main,
#   * pull requests that touch the workflow itself, the sources, the tests,
#     the build/packaging metadata or any Markdown file,
#   * published releases.
on:
  push:
    branches: [ main ]
  pull_request:
    paths:
      - '.github/workflows/python-package.yml'
      - 'bitsandbytes/**'
      - 'csrc/**'
      - 'include/**'
      - 'tests/**'
      - 'CMakeLists.txt'
      - 'requirements*.txt'
      - 'setup.py'
      - 'pyproject.toml'
      - 'pytest.ini'
      - '**/*.md'
  release:
    # The release event only supports activity-type filtering; it has no
    # branch filter, so none is declared here.
    types: [ published ]

# One concurrency group per git ref: a newly started run cancels the run
# that is still in progress for the same ref.
concurrency:
  cancel-in-progress: true
  group: cmake-${{ github.ref }}

jobs:

  ##
  # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
  # Linux aarch64 is cross-compiled with the GNU aarch64 toolchain; on macOS the target
  # architecture is selected explicitly through CMAKE_OSX_ARCHITECTURES.
  ##
  build-shared-libs:
    strategy:
      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
      fail-fast: false

      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        arch: [x86_64, aarch64]
        build_type: [Release]
        exclude:
          - os: windows-latest # This probably requires arm64 Windows agents
            arch: aarch64
    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
    steps:
      # Check out code
    - uses: actions/checkout@v4
    - name: Allow cross-compile on aarch64
      if: ${{ matrix.os == 'ubuntu-latest' && matrix.arch == 'aarch64' }}
      run: |
        # Refresh the package index so the install does not rely on a stale one
        sudo apt-get update
        # Install the C and C++ cross compilers; the configure step below hands a C++ compiler to CMake
        sudo apt-get install -y gcc-aarch64-linux-gnu g++-aarch64-linux-gnu binutils-aarch64-linux-gnu

    - name: Setup cmake
      uses: jwlawson/actions-setup-cmake@v1.14
      with:
        cmake-version: '3.26.x'
    - name: Set up MSVC
      if: startsWith(matrix.os, 'windows')
      uses: ilammy/msvc-dev-cmd@v1.13.0
      # Check out dependencies code
    - uses: actions/checkout@v4
      name: Check out NVidia cub
      with:
        repository: nvidia/cub
        ref: 1.11.0
        path: dependencies/cub

    - name: Set reusable strings
      # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
      id: strings
      shell: bash
      run: |
        echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"

    - name: Prep Compilers
      shell: bash -el {0}
      run: |
        python3 -m pip install cmake==3.27.9 ninja

        # Pick the C++ compiler that matches the target of this matrix entry
        if [ "${{ matrix.os }}" = "windows-latest" ]; then
          echo CXX_COMPILER=cl >> "$GITHUB_ENV"
        elif [ "${{ matrix.os }}" = "ubuntu-latest" ] && [ "${{ matrix.arch }}" = "aarch64" ]; then
          # Use the cross compiler installed above, otherwise the host g++ would emit host-architecture code
          echo CXX_COMPILER=aarch64-linux-gnu-g++ >> "$GITHUB_ENV"
        else
          echo CXX_COMPILER=g++ >> "$GITHUB_ENV"
        fi

        # On macOS always state the target architecture instead of depending on the runner's native one
        if [ "${{ matrix.os }}" = "macos-latest" ]; then
          if [ "${{ matrix.arch }}" = "aarch64" ]; then
            echo DCMAKE_OSX_ARCHITECTURES=-DCMAKE_OSX_ARCHITECTURES=arm64 >> "$GITHUB_ENV"
          else
            echo DCMAKE_OSX_ARCHITECTURES=-DCMAKE_OSX_ARCHITECTURES=x86_64 >> "$GITHUB_ENV"
          fi
        fi

    - name: Configure CPU
      # DCMAKE_OSX_ARCHITECTURES is only set on macOS and expands to nothing elsewhere
      run: >
        cmake -B ${{ steps.strings.outputs.build-output-dir }}
        -G Ninja ${{ env.DCMAKE_OSX_ARCHITECTURES }}
        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
        -DCOMPUTE_BACKEND=cpu
        -S ${{ github.workspace }}

    - name: Build CPU
      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}

    - name: Copy libraries
      shell: bash
      run: |
        mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
        # nullglob drops the extensions that do not exist on this platform
        ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
    - name: Upload build artifact
      uses: actions/upload-artifact@v4
      with:
        name: shared_library_${{ matrix.os }}_${{ matrix.arch }}
        path: output/*

  ##
  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Windows)
  ##
  build-shared-libs-cuda:
    strategy:
      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
      fail-fast: false

      matrix:
        os: [windows-latest]
        arch: [x86_64]
        cuda-version: ['11.8.0', '12.1.1']
        build_type: [Release]
    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
    steps:
      # Check out code
    - uses: actions/checkout@v4
      # Linux CUDA builds are produced by the container-based job (build-shared-libs-cuda-docker), not here
    - name: Setup cmake
      uses: jwlawson/actions-setup-cmake@v1.14
      with:
        cmake-version: '3.26.x'
    - name: Set up Python 3.10
      uses: actions/setup-python@v5
      with:
        python-version: "3.10"

    - name: Set up MSVC
      if: startsWith(matrix.os, 'windows')
      uses: ilammy/msvc-dev-cmd@v1.13.0
      # Check out dependencies code
    - uses: actions/checkout@v4
      name: Check out NVidia cub
      with:
        repository: nvidia/cub
        ref: 1.11.0
        path: dependencies/cub
    - name: Setup Mambaforge
      uses: conda-incubator/setup-miniconda@v3.0.1
      with:
        miniforge-variant: Mambaforge
        miniforge-version: latest
        activate-environment: bnb-env
        use-mamba: true

      # Second setup pass: creates/activates bnb-env from environment-bnb.yml
    - uses: conda-incubator/setup-miniconda@v3.0.1
      with:
        auto-update-conda: true
        activate-environment: bnb-env
        environment-file: environment-bnb.yml
        use-only-tar-bz2: false
        auto-activate-base: true
        python-version: "3.10"
        mamba-version: "*"

    - name: Set reusable strings
      # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
      id: strings
      shell: bash
      run: |
        echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"

    - name: Allow cross-compile on aarch64
      # Never true with the current Windows-only matrix; kept for parity with the other build jobs
      if: ${{ matrix.os == 'ubuntu-latest' && matrix.arch == 'aarch64' }}
      run: |
        # Allow cross-compile on aarch64
        sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu

    - name: CUDA Toolkit
      shell: bash -el {0}
      run: |
        if [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
            # to prepare space
            sudo rm -rf /usr/share/dotnet
            sudo rm -rf /opt/ghc
            sudo rm -rf /usr/local/share/boost
        fi

        cuda_version=${{ matrix.cuda-version }}

        # Extra packages needed only for specific version/OS combinations
        addon=""
        if [ "$cuda_version" = "12.1.1" ] && [ "${{ matrix.os }}" = "ubuntu-latest" ]; then
            addon="cuda-cudart-static cuda-nvrtc"
        fi
        if [ "$cuda_version" = "11.8.0" ] && [ "${{ matrix.os }}" = "windows-latest" ]; then
            addon="cuda-nvrtc"
        fi

        # major.minor form of the version (11.8.0 -> 11.8, 12.1.1 -> 12.1); derived by
        # stripping the last component so new matrix versions need no extra mapping
        cuda__version="${cuda_version%.*}"

        conda install pytorch-cuda=$cuda__version -c pytorch # it's dependency not correctly resolved sometime
        # $addon is left unquoted on purpose: it may hold several package names
        conda install cuda-python=$cuda__version cuda-libraries-dev cuda-nvcc cuda-nvtx cuda-cupti cuda-cudart cuda-cudart-dev cuda-runtime cuda-libraries $addon -c "nvidia/label/cuda-$cuda_version"

        # The toolkit was installed into the bnb-env conda environment
        CUDA_HOME="${{ env.CONDA }}/envs/bnb-env"
        echo "CUDA_HOME=$CUDA_HOME" >> "$GITHUB_ENV"
        echo "CUDA_PATH=$CUDA_HOME" >> "$GITHUB_ENV"

        if [ "${{ matrix.os }}" = "windows-latest" ]; then
            echo CXX_COMPILER=cl >> "$GITHUB_ENV"
            # without -DCMAKE_CUDA_COMPILER=nvcc, cmake config always fail for cuda-11.8
            echo DCMAKE_CUDA_COMPILER=-DCMAKE_CUDA_COMPILER=nvcc >> "$GITHUB_ENV"
        else
            echo CXX_COMPILER=g++ >> "$GITHUB_ENV"
        fi

        nvcc --version

    - name: Update environment
      run: mamba env update -n bnb-env -f environment-bnb.yml

    - name: Prep build
      run: python -m pip install cmake==3.27.9 ninja

    # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel

    - name: Configure CUDA
      run: >
        cmake -B ${{ steps.strings.outputs.build-output-dir }}
        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
        -DCOMPUTE_BACKEND=cuda
        -S ${{ github.workspace }}

    - name: Build CUDA
      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}

      # Same build directory is reconfigured with NO_CUBLASLT=ON for the second variant
    - name: Configure NOBLASLT
      run: >
        cmake -B ${{ steps.strings.outputs.build-output-dir }}
        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
        -DCOMPUTE_BACKEND=cuda
        -DNO_CUBLASLT=ON
        -S ${{ github.workspace }}

    - name: Build NOBLASLT
      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}

    - name: Copy libraries
      shell: bash
      run: |
        mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
        ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
    - name: Upload build artifact
      uses: actions/upload-artifact@v4
      with:
        name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda-version }}
        path: output/*

  ##
  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64)
  # The build runs inside the nvidia/cuda devel container matching the matrix CUDA version.
  ##
  build-shared-libs-cuda-docker:
    strategy:
      # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable.
      fail-fast: false

      matrix:
        os: [ubuntu-latest]
        arch: [x86_64, aarch64]
        cuda-version: ['11.8.0', '12.1.1']
        build_type: [Release]
    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
    container:
      # The matrix only contains ubuntu-latest, so the image name depends on the CUDA version alone
      image: nvidia/cuda:${{ matrix.cuda-version }}-devel-ubuntu22.04
      volumes:
        - /home/runner/work:/home/runner/work
    steps:
      # Check out code
    - uses: actions/checkout@v4
      # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation)
      # NOTE(review): no emulation setup is visible in this job and the configure steps use plain g++;
      # confirm that the aarch64 matrix entries really produce aarch64 binaries.
    - name: Setup cmake
      uses: jwlawson/actions-setup-cmake@v1.14
      with:
        cmake-version: '3.26.x'
    - name: Set up Python 3.10
      uses: actions/setup-python@v5
      with:
        python-version: "3.10"

    - name: Setup Docker image
      if: startsWith(matrix.os, 'ubuntu')
      shell: bash
      run: |
        apt-get update
        # sudo is installed here because a later step invokes it inside the container
        DEBIAN_FRONTEND=noninteractive apt-get install -y sudo cmake

    - name: Set reusable strings
      # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file.
      id: strings
      shell: bash
      run: |
        echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT"

    - name: Allow cross-compile on aarch64
      if: ${{ matrix.os == 'ubuntu-latest' && matrix.arch == 'aarch64' }}
      run: |
        # Allow cross-compile on aarch64
        sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu

    - name: CUDA Toolkit
      shell: bash -el {0}
      run: |
        # No step in this job installs Conda, so the toolkit root is taken from the nvcc
        # found on PATH (<root>/bin/nvcc -> <root>) instead of a Conda environment path.
        nvcc_path="$(command -v nvcc)"
        CUDA_HOME="$(dirname "$(dirname "$nvcc_path")")"
        echo "CUDA_HOME=$CUDA_HOME" >> "$GITHUB_ENV"
        echo "CUDA_PATH=$CUDA_HOME" >> "$GITHUB_ENV"
        echo CXX_COMPILER=g++ >> "$GITHUB_ENV"

        nvcc --version

    - name: Prep build
      run: python -m pip install cmake==3.27.9 ninja

    # TODO: the following steps (CUDA, NOBLASLT, CPU) could be moved to the matrix, so they're built in parallel

    - name: Configure CUDA
      # DCMAKE_CUDA_COMPILER is not set by any step of this job, so it expands to nothing here
      run: >
        cmake -B ${{ steps.strings.outputs.build-output-dir }}
        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
        -DCOMPUTE_BACKEND=cuda
        -S ${{ github.workspace }}

    - name: Build CUDA
      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}

    - name: Configure NOBLASLT
      run: >
        cmake -B ${{ steps.strings.outputs.build-output-dir }}
        -G Ninja ${{ env.DCMAKE_CUDA_COMPILER }}
        -DCMAKE_CXX_COMPILER=${{ env.CXX_COMPILER }}
        -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
        -DCOMPUTE_CAPABILITY="50;52;60;61;62;70;72;75;80;86;87;89;90"
        -DCOMPUTE_BACKEND=cuda
        -DNO_CUBLASLT=ON
        -S ${{ github.workspace }}

    - name: Build NOBLASLT
      run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }}

    - name: Copy libraries
      shell: bash
      run: |
        mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
        ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
    - name: Upload build artifact
      uses: actions/upload-artifact@v4
      with:
        name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda-version }}
        path: output/*

  # Packages one wheel per OS/arch pair from the shared libraries produced by the build jobs.
  build-wheels:
    needs: [build-shared-libs, build-shared-libs-cuda, build-shared-libs-cuda-docker]
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os:
          - ubuntu-latest
          - macos-latest
          - windows-latest
        arch:
          - x86_64
          - aarch64
        exclude:
          - arch: aarch64
            os: windows-latest # This probably requires arm64 Windows agents
    steps:
      # Fetch the repository sources
    - uses: actions/checkout@v4
      # Gather every shared-library artifact built for this OS/arch pair into output/
    - name: Download build artifact
      uses: actions/download-artifact@v4
      with:
        pattern: "shared_library*_${{ matrix.os }}_${{ matrix.arch }}*"
        path: output/
        merge-multiple: true
      # Place the downloaded libraries next to the Python package sources
    - name: Copy correct platform shared library
      shell: bash
      run: |
        ls -lR output/
        cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
      # Interpreter used for packaging, with pip caching enabled
    - name: Set up Python 3.10
      uses: actions/setup-python@v5
      with:
        cache: pip
        python-version: "3.10"
    - name: Install build package
      shell: bash
      run: pip install build
    - name: Install Python test dependencies
      shell: bash
      run: pip install -r requirements-ci.txt
    # TODO: How to run CUDA tests on GitHub actions?
    #- name: Run unit tests
    #  if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents
    #  run: |
    #    PYTHONPATH=. pytest --log-cli-level=DEBUG tests
    - name: Build wheel
      shell: bash
      run: python -m build . --wheel
      # Publish the wheel as a per-platform artifact for the publish job
    - name: Upload build artifact
      uses: actions/upload-artifact@v4
      with:
        path: ${{ github.workspace }}/dist/bitsandbytes-*.whl
        name: bdist_wheel_${{ matrix.os }}-${{ matrix.arch }}
  # Collects all wheels into dist/ and uploads them to PyPI when the run was started from a tag ref.
  publish:
    runs-on: ubuntu-latest
    needs: build-wheels
    steps:
    - uses: actions/checkout@v4
      # Merge every bdist_wheel_* artifact into a single dist/ directory
    - name: Download build artifact
      uses: actions/download-artifact@v4
      with:
        pattern: "bdist_wheel_*"
        merge-multiple: true
        path: dist/
      # Log what is about to be published
    - run: |
        ls -lR dist/
      # The upload itself is skipped unless the ref is a tag
    - name: Publish to PyPi
      uses: pypa/gh-action-pypi-publish@release/v1
      if: startsWith(github.ref, 'refs/tags')
      with:
        password: ${{ secrets.pypi }}