forked from LostRuins/koboldcpp
Commit
Merge remote-tracking branch 'upstream/concedo'
Showing 220 changed files with 85,452 additions and 38,026 deletions.
@@ -0,0 +1,300 @@
# Benchmark
name: Benchmark

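# Triggers: manual dispatch (with GPU series, commit SHA and duration as
# inputs), pushes and pull requests that touch the core ggml/llama/server
# sources, and a nightly schedule.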
on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

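  # A sketch of a manual run with the GitHub CLI (matched by the workflow
  # name above; the input values are illustrative only):
  #   gh workflow run Benchmark -f gpu-series=Standard_NC4as_T4_v3 -f duration=30m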
  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

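# One benchmark run at a time per workflow and ref (or per requested SHA);
# queuing a new run cancels the one still in progress.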
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME: no way found yet to avoid duplicating the runs-on label
      N_USERS: 8
      DURATION: 10m

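    # Every model/ftype pair becomes its own job; the `include` entry enables
    # the PR comment for the phi-2/q4_0 combination only.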
    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

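      # Start a local Prometheus to scrape the server metrics during the bench;
      # the nc loop waits until it is listening on its default port 9090.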
      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

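      # Go is needed only to build a custom k6 binary: xk6 compiles k6 together
      # with the SSE extension that the bench scenario relies on.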
      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse

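      # CUDA build of the server target only; the T4 is compute capability 7.5,
      # hence CMAKE_CUDA_ARCHITECTURES=75.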
      - name: Build
        id: cmake_build
        run: |
          set -eux
          mkdir build
          cd build
          cmake .. \
            -DLLAMA_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DLLAMA_CUBLAS=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release;
          cmake --build . --config Release -j $(nproc) --target server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

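      # Drive the server with the k6 scenario; bench.py writes the aggregated
      # metrics to results.github.env, which is appended to $GITHUB_ENV for the
      # reporting steps below.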
      - name: Server bench
        id: server_bench
        run: |
          set -eux
          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch ${{ github.head_ref || github.ref_name }} \
            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
            --scenario script.js \
            --duration ${{ github.event.inputs.duration || env.DURATION }} \
            --hf-repo ggml-org/models \
            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
            --model-path-prefix /models \
            --parallel ${{ env.N_USERS }} \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048
          cat results.github.env >> $GITHUB_ENV
          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

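      # Publish the bench summary as a commit status on the benchmarked SHA
      # (BENCH_RESULTS is assumed to come from results.github.env above).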
      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        # The action reference was mangled by e-mail obfuscation in the source;
        # name and version are reconstructed from the imgur client_id/urls usage.
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important as it looks unstable: 503
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

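      # The bench also renders each chart as a .mermaid file; load them into
      # multi-line env vars via the heredoc-style $GITHUB_ENV syntax.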
      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux
          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV
          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux
          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

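      # Post a sticky PR comment; message-id makes reruns update it in place.
      # IMAGE_O (letter O, not zero) is kept as-is since the template below
      # references it by that name.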
      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">
            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS }} iterations** 🚀
            </p>
            <details>
            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request: avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}

            <p align="center">
            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />
            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>
            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>
            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>
            </p>
            <details>
            <summary>Details</summary>
            <p align="center">
            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />
            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>
            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>
            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>
            </p>
            </details>
            </details>